Setup

# global options
knitr::opts_chunk$set(echo = TRUE)
# set working directory
setwd("../coinoculation-endophytes")

Preface

This markdown follows the DADA2 Pipeline Tutorial (1.16) (Benjamin Callahan) and sometimes code, comments, and text are entirely copied from the tutorial and pasted here. Modifications were made for these data and this study. Further down, I start a differential abundance analysis with DESeq2. The workflow I follow is also derived from this tutorial by Meeta Mistry, Mary Piper, Jihe Liu, and Radhika Khetani (2021).

DADA2 pipeline with my amplicon data

Here I will import my forward and reverse reads from the Fluidigm_2020121 V3-V4 amplicon data to DADA2. Reads are already demultiplexed by the sequencing center.

Getting started

Load packages

library(dada2); packageVersion("dada2") # for ASV sample inference
## Loading required package: Rcpp
## [1] '1.30.0'
library(cowplot); packageVersion("cowplot") # for plotting
## [1] '1.1.1'
library(phyloseq); packageVersion("phyloseq") # to create sequence objects
## [1] '1.46.0'
library(DECIPHER); packageVersion("DECIPHER") # for assigning taxonomy
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Warning: package 'S4Vectors' was built under R version 4.3.2
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:phyloseq':
## 
##     distance
## Loading required package: XVector
## Loading required package: GenomeInfoDb
## Warning: package 'GenomeInfoDb' was built under R version 4.3.2
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
## 
##     strsplit
## Loading required package: RSQLite
## Loading required package: parallel
## [1] '2.30.0'
library(Biostrings); packageVersion("Biostrings") # for manipulating sequences
## [1] '2.70.1'
library(tidyverse); packageVersion("tidyverse") # includes ggplot2, dplyr, readr, stringr
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%within%() masks IRanges::%within%()
## ✖ dplyr::collapse()     masks Biostrings::collapse(), IRanges::collapse()
## ✖ dplyr::combine()      masks BiocGenerics::combine()
## ✖ purrr::compact()      masks XVector::compact()
## ✖ dplyr::desc()         masks IRanges::desc()
## ✖ tidyr::expand()       masks S4Vectors::expand()
## ✖ dplyr::filter()       masks stats::filter()
## ✖ dplyr::first()        masks S4Vectors::first()
## ✖ dplyr::lag()          masks stats::lag()
## ✖ ggplot2::Position()   masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce()       masks IRanges::reduce()
## ✖ dplyr::rename()       masks S4Vectors::rename()
## ✖ lubridate::second()   masks S4Vectors::second()
## ✖ lubridate::second<-() masks S4Vectors::second<-()
## ✖ dplyr::slice()        masks XVector::slice(), IRanges::slice()
## ✖ lubridate::stamp()    masks cowplot::stamp()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## [1] '2.0.0'
library(pheatmap); packageVersion("pheatmap") # to make a pretty heatmap
## [1] '1.0.12'
library(RColorBrewer); packageVersion("RColorBrewer") # for extra color palettes
## [1] '1.1.3'
library(viridis); packageVersion("viridis") # for extra color palettes
## Loading required package: viridisLite
## [1] '0.6.4'
library(DESeq2); packageVersion("DESeq2") # for differential abundance analysis
## Loading required package: GenomicRanges
## Loading required package: SummarizedExperiment
## Warning: package 'SummarizedExperiment' was built under R version 4.3.2
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## 
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## 
## Attaching package: 'MatrixGenerics'
## 
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## 
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## 
## Attaching package: 'Biobase'
## 
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## 
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
## 
## The following object is masked from 'package:phyloseq':
## 
##     sampleNames
## [1] '1.42.0'
library(edgeR); packageVersion("edgeR") # normalization tools
## Warning: package 'edgeR' was built under R version 4.3.2
## Loading required package: limma
## 
## Attaching package: 'limma'
## 
## The following object is masked from 'package:DESeq2':
## 
##     plotMA
## 
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
## [1] '4.0.2'
library(ggrepel); packageVersion("ggrepel") # for plots
## [1] '0.9.4'

Define path to reads

path <- "./16S/V3_F357_N_V4_R805" # Change this to the path of your local version of the 16S amplicon .fastq reads
list.files(path)
##   [1] "filtered"                                             
##   [2] "unmatchedIndex.fastq"                                 
##   [3] "V3_F357_N_V4_R805-141_1_N_N_GTGTATGCGT_R1.fastq"      
##   [4] "V3_F357_N_V4_R805-141_1_N_N_GTGTATGCGT_R2.fastq"      
##   [5] "V3_F357_N_V4_R805-141_1_N_S_GTATCGTCGT_R1.fastq"      
##   [6] "V3_F357_N_V4_R805-141_1_N_S_GTATCGTCGT_R2.fastq"      
##   [7] "V3_F357_N_V4_R805-141_1_R_N_GTCGTCGTCT_R1.fastq"      
##   [8] "V3_F357_N_V4_R805-141_1_R_N_GTCGTCGTCT_R2.fastq"      
##   [9] "V3_F357_N_V4_R805-141_1_R_S_TGCTCGTAGT_R1.fastq"      
##  [10] "V3_F357_N_V4_R805-141_1_R_S_TGCTCGTAGT_R2.fastq"      
##  [11] "V3_F357_N_V4_R805-141_2_N_N_CGTGCTGTCA_R1.fastq"      
##  [12] "V3_F357_N_V4_R805-141_2_N_N_CGTGCTGTCA_R2.fastq"      
##  [13] "V3_F357_N_V4_R805-141_2_N_S_GAGCTAGTGA_R1.fastq"      
##  [14] "V3_F357_N_V4_R805-141_2_N_S_GAGCTAGTGA_R2.fastq"      
##  [15] "V3_F357_N_V4_R805-141_2_R_N_GTGCTGTCGT_R1.fastq"      
##  [16] "V3_F357_N_V4_R805-141_2_R_N_GTGCTGTCGT_R2.fastq"      
##  [17] "V3_F357_N_V4_R805-141_2_R_S_GATCGTCTCT_R1.fastq"      
##  [18] "V3_F357_N_V4_R805-141_2_R_S_GATCGTCTCT_R2.fastq"      
##  [19] "V3_F357_N_V4_R805-141_5_N_N_CGCTATCAGT_R1.fastq"      
##  [20] "V3_F357_N_V4_R805-141_5_N_N_CGCTATCAGT_R2.fastq"      
##  [21] "V3_F357_N_V4_R805-141_5_N_S_GAGTGATCGT_R1.fastq"      
##  [22] "V3_F357_N_V4_R805-141_5_N_S_GAGTGATCGT_R2.fastq"      
##  [23] "V3_F357_N_V4_R805-141_5_R_N_GCTAGTGAGT_R1.fastq"      
##  [24] "V3_F357_N_V4_R805-141_5_R_N_GCTAGTGAGT_R2.fastq"      
##  [25] "V3_F357_N_V4_R805-141_5_R_S_CGCTGTAGTC_R1.fastq"      
##  [26] "V3_F357_N_V4_R805-141_5_R_S_CGCTGTAGTC_R2.fastq"      
##  [27] "V3_F357_N_V4_R805-141_7_N_N_GCGTCGTGTA_R1.fastq"      
##  [28] "V3_F357_N_V4_R805-141_7_N_N_GCGTCGTGTA_R2.fastq"      
##  [29] "V3_F357_N_V4_R805-141_7_N_S_GTGCGTGTGT_R1.fastq"      
##  [30] "V3_F357_N_V4_R805-141_7_N_S_GTGCGTGTGT_R2.fastq"      
##  [31] "V3_F357_N_V4_R805-141_7_R_N_GATGTAGCGT_R1.fastq"      
##  [32] "V3_F357_N_V4_R805-141_7_R_N_GATGTAGCGT_R2.fastq"      
##  [33] "V3_F357_N_V4_R805-141_7_R_S_GTCGTGTACT_R1.fastq"      
##  [34] "V3_F357_N_V4_R805-141_7_R_S_GTCGTGTACT_R2.fastq"      
##  [35] "V3_F357_N_V4_R805-717_10_N_N_GTGAGAGACA_R1.fastq"     
##  [36] "V3_F357_N_V4_R805-717_10_N_N_GTGAGAGACA_R2.fastq"     
##  [37] "V3_F357_N_V4_R805-717_10_N_S_TACATCGCTG_R1.fastq"     
##  [38] "V3_F357_N_V4_R805-717_10_N_S_TACATCGCTG_R2.fastq"     
##  [39] "V3_F357_N_V4_R805-717_10_R_N_GCACGTAGCT_R1.fastq"     
##  [40] "V3_F357_N_V4_R805-717_10_R_N_GCACGTAGCT_R2.fastq"     
##  [41] "V3_F357_N_V4_R805-717_10_R_S_GACTGTACGT_R1.fastq"     
##  [42] "V3_F357_N_V4_R805-717_10_R_S_GACTGTACGT_R2.fastq"     
##  [43] "V3_F357_N_V4_R805-717_5_N_N_TAGTAGCGCG_R1.fastq"      
##  [44] "V3_F357_N_V4_R805-717_5_N_N_TAGTAGCGCG_R2.fastq"      
##  [45] "V3_F357_N_V4_R805-717_5_N_S_TACTGAGCTG_R1.fastq"      
##  [46] "V3_F357_N_V4_R805-717_5_N_S_TACTGAGCTG_R2.fastq"      
##  [47] "V3_F357_N_V4_R805-717_5_R_N_GTACTCGCGA_R1.fastq"      
##  [48] "V3_F357_N_V4_R805-717_5_R_N_GTACTCGCGA_R2.fastq"      
##  [49] "V3_F357_N_V4_R805-717_5_R_S_GACGTCTGCT_R1.fastq"      
##  [50] "V3_F357_N_V4_R805-717_5_R_S_GACGTCTGCT_R2.fastq"      
##  [51] "V3_F357_N_V4_R805-717_6_N_N_CGTACTACGT_R1.fastq"      
##  [52] "V3_F357_N_V4_R805-717_6_N_N_CGTACTACGT_R2.fastq"      
##  [53] "V3_F357_N_V4_R805-717_6_N_S_TCACGCTATG_R1.fastq"      
##  [54] "V3_F357_N_V4_R805-717_6_N_S_TCACGCTATG_R2.fastq"      
##  [55] "V3_F357_N_V4_R805-717_6_R_N_GAGATCAGTC_R1.fastq"      
##  [56] "V3_F357_N_V4_R805-717_6_R_N_GAGATCAGTC_R2.fastq"      
##  [57] "V3_F357_N_V4_R805-717_6_R_S_CAGCTGAGTA_R1.fastq"      
##  [58] "V3_F357_N_V4_R805-717_6_R_S_CAGCTGAGTA_R2.fastq"      
##  [59] "V3_F357_N_V4_R805-717_9_N_N_TAGACGTGCT_R1.fastq"      
##  [60] "V3_F357_N_V4_R805-717_9_N_N_TAGACGTGCT_R2.fastq"      
##  [61] "V3_F357_N_V4_R805-717_9_N_S_TCTGAGCGCA_R1.fastq"      
##  [62] "V3_F357_N_V4_R805-717_9_N_S_TCTGAGCGCA_R2.fastq"      
##  [63] "V3_F357_N_V4_R805-717_9_R_N_TCGAGTAGCG_R1.fastq"      
##  [64] "V3_F357_N_V4_R805-717_9_R_N_TCGAGTAGCG_R2.fastq"      
##  [65] "V3_F357_N_V4_R805-717_9_R_S_GTGACTCGTC_R1.fastq"      
##  [66] "V3_F357_N_V4_R805-717_9_R_S_GTGACTCGTC_R2.fastq"      
##  [67] "V3_F357_N_V4_R805-733_10_N_N_TAGTCTGTCA_R1.fastq"     
##  [68] "V3_F357_N_V4_R805-733_10_N_N_TAGTCTGTCA_R2.fastq"     
##  [69] "V3_F357_N_V4_R805-733_10_N_S_CGTATGATGT_R1.fastq"     
##  [70] "V3_F357_N_V4_R805-733_10_N_S_CGTATGATGT_R2.fastq"     
##  [71] "V3_F357_N_V4_R805-733_10_R_N_CTAGAGTATC_R1.fastq"     
##  [72] "V3_F357_N_V4_R805-733_10_R_N_CTAGAGTATC_R2.fastq"     
##  [73] "V3_F357_N_V4_R805-733_10_R_S_TGTCTCTATC_R1.fastq"     
##  [74] "V3_F357_N_V4_R805-733_10_R_S_TGTCTCTATC_R2.fastq"     
##  [75] "V3_F357_N_V4_R805-733_4_N_N_TATCGATGCT_R1.fastq"      
##  [76] "V3_F357_N_V4_R805-733_4_N_N_TATCGATGCT_R2.fastq"      
##  [77] "V3_F357_N_V4_R805-733_4_N_S_TGTGTCACTA_R1.fastq"      
##  [78] "V3_F357_N_V4_R805-733_4_N_S_TGTGTCACTA_R2.fastq"      
##  [79] "V3_F357_N_V4_R805-733_4_R_N_CATGCATCAT_R1.fastq"      
##  [80] "V3_F357_N_V4_R805-733_4_R_N_CATGCATCAT_R2.fastq"      
##  [81] "V3_F357_N_V4_R805-733_4_R_S_TAGAGTCTGT_R1.fastq"      
##  [82] "V3_F357_N_V4_R805-733_4_R_S_TAGAGTCTGT_R2.fastq"      
##  [83] "V3_F357_N_V4_R805-733_8_N_N_CGTCTATGAT_R1.fastq"      
##  [84] "V3_F357_N_V4_R805-733_8_N_N_CGTCTATGAT_R2.fastq"      
##  [85] "V3_F357_N_V4_R805-733_8_N_S_TGATCAGTCA_R1.fastq"      
##  [86] "V3_F357_N_V4_R805-733_8_N_S_TGATCAGTCA_R2.fastq"      
##  [87] "V3_F357_N_V4_R805-733_8_R_N_CTAGATCTGA_R1.fastq"      
##  [88] "V3_F357_N_V4_R805-733_8_R_N_CTAGATCTGA_R2.fastq"      
##  [89] "V3_F357_N_V4_R805-733_8_R_S_GTGATACTGA_R1.fastq"      
##  [90] "V3_F357_N_V4_R805-733_8_R_S_GTGATACTGA_R2.fastq"      
##  [91] "V3_F357_N_V4_R805-733_9_N_N_CATGAGTGTA_R1.fastq"      
##  [92] "V3_F357_N_V4_R805-733_9_N_N_CATGAGTGTA_R2.fastq"      
##  [93] "V3_F357_N_V4_R805-733_9_N_S_TATCATGTGC_R1.fastq"      
##  [94] "V3_F357_N_V4_R805-733_9_N_S_TATCATGTGC_R2.fastq"      
##  [95] "V3_F357_N_V4_R805-733_9_R_N_TATCTCATGC_R1.fastq"      
##  [96] "V3_F357_N_V4_R805-733_9_R_N_TATCTCATGC_R2.fastq"      
##  [97] "V3_F357_N_V4_R805-733_9_R_S_TGTCGTCATA_R1.fastq"      
##  [98] "V3_F357_N_V4_R805-733_9_R_S_TGTCGTCATA_R2.fastq"      
##  [99] "V3_F357_N_V4_R805-KIT_NEG_CTRL_CGAGTGCTGT_R1.fastq"   
## [100] "V3_F357_N_V4_R805-KIT_NEG_CTRL_CGAGTGCTGT_R2.fastq"   
## [101] "V3_F357_N_V4_R805-KIT_NEG_CTRL2_TCAGATGCTA_R1.fastq"  
## [102] "V3_F357_N_V4_R805-KIT_NEG_CTRL2_TCAGATGCTA_R2.fastq"  
## [103] "V3_F357_N_V4_R805-WATER_NEG_CTRL_GTATGAGCAC_R1.fastq" 
## [104] "V3_F357_N_V4_R805-WATER_NEG_CTRL_GTATGAGCAC_R2.fastq" 
## [105] "V3_F357_N_V4_R805-WATER_NEG_CTRL2_TATCAGTCTG_R1.fastq"
## [106] "V3_F357_N_V4_R805-WATER_NEG_CTRL2_TATCAGTCTG_R2.fastq"
## [107] "V3_F357_N_V4_R805-WaterFG_TATAGCACGC_R1.fastq"        
## [108] "V3_F357_N_V4_R805-WaterFG_TATAGCACGC_R2.fastq"        
## [109] "V3_F357_N_V4_R805-WaterFG2_TATGTACGTG_R1.fastq"       
## [110] "V3_F357_N_V4_R805-WaterFG2_TATGTACGTG_R2.fastq"

Reads representing samples co-inoculated strains MAG522 and MAG702A are located within the folder “excluded_amplicon_samples” and are not included here, since the scope of the paper only pertains to 141, 717A, and 733B.

Now we read in the names of the fastq files, and perform some string manipulation to get matched lists of the forward and reverse fastq files.

# Forward and reverse fastq filenames have format: V3_F357_N_V4_R805-SAMPLENAME_BARCODE_R1.fastq and V3_F357_N_V4_R805-SAMPLENAME_BARCODE_R2.fastq
fnFs <- sort(list.files(path, pattern="_R1.fastq", full.names = TRUE))
fnRs <- sort(list.files(path, pattern="_R2.fastq", full.names = TRUE))
# Extract sample names, assuming filenames have the above format:
sample.names <- sapply(strsplit(basename(fnFs), "-"), `[`, 2)
sample.names <- sapply(strsplit(basename(sample.names), "_R1"), `[`, 1)
sample.names <- sapply(strsplit(basename(sample.names), "_T"), `[`, 1)
sample.names <- sapply(strsplit(basename(sample.names), "_A"), `[`, 1)
sample.names <- sapply(strsplit(basename(sample.names), "_G"), `[`, 1)
sample.names <- sapply(strsplit(basename(sample.names), "_CA"), `[`, 1)
sample.names <- sapply(strsplit(basename(sample.names), "_CG"), `[`, 1)
sample.names <- sapply(strsplit(basename(sample.names), "_CC"), `[`, 1)
sample.names <- sapply(strsplit(basename(sample.names), "_CTA"), `[`, 1)
sample.names <- sapply(strsplit(basename(sample.names), "_CTT"), `[`, 1)
sample.names <- sapply(strsplit(basename(sample.names), "_CTG"), `[`, 1)
sample.names <- sapply(strsplit(basename(sample.names), "_CTC"), `[`, 1)
sample.names
##  [1] "141_1_N_N"       "141_1_N_S"       "141_1_R_N"       "141_1_R_S"      
##  [5] "141_2_N_N"       "141_2_N_S"       "141_2_R_N"       "141_2_R_S"      
##  [9] "141_5_N_N"       "141_5_N_S"       "141_5_R_N"       "141_5_R_S"      
## [13] "141_7_N_N"       "141_7_N_S"       "141_7_R_N"       "141_7_R_S"      
## [17] "717_10_N_N"      "717_10_N_S"      "717_10_R_N"      "717_10_R_S"     
## [21] "717_5_N_N"       "717_5_N_S"       "717_5_R_N"       "717_5_R_S"      
## [25] "717_6_N_N"       "717_6_N_S"       "717_6_R_N"       "717_6_R_S"      
## [29] "717_9_N_N"       "717_9_N_S"       "717_9_R_N"       "717_9_R_S"      
## [33] "733_10_N_N"      "733_10_N_S"      "733_10_R_N"      "733_10_R_S"     
## [37] "733_4_N_N"       "733_4_N_S"       "733_4_R_N"       "733_4_R_S"      
## [41] "733_8_N_N"       "733_8_N_S"       "733_8_R_N"       "733_8_R_S"      
## [45] "733_9_N_N"       "733_9_N_S"       "733_9_R_N"       "733_9_R_S"      
## [49] "KIT_NEG_CTRL"    "KIT_NEG_CTRL2"   "WATER_NEG_CTRL"  "WATER_NEG_CTRL2"
## [53] "WaterFG"         "WaterFG2"

Inspect read quality profiles

We start by visualizing the quality profiles of the forward reads:

forward_quality_plots <- plotQualityProfile(fnFs[1:86])
save_plot("./16S/forward_quality_plots.png", forward_quality_plots, base_height=12, base_width=24)
forward_quality_plots

In gray-scale is a heat map of the frequency of each quality score at each base position. The mean quality score at each position is shown by the green line, and the quartiles of the quality score distribution by the orange lines. The red line shows the scaled proportion of reads that extend to at least that position (this is more useful for other sequencing technologies, as Illumina reads are typically all the same length, hence the flat red line).

All forward reads seem to be of high quality (except the control samples). Even the end of the sequences all seem to be above or near 30, so I will not truncate the tails.

Now we visualize the quality profile of the reverse reads:

reverse_quality_plots <- plotQualityProfile(fnRs[1:86])
save_plot("./16S/reverse_quality_plot.png", reverse_quality_plots, base_height=12, base_width=24)
reverse_quality_plots

All reverse reads except controls also seem to be of high quality.

Filter and trim

Assign the filenames for the filtered fastq.gz files.

# Place filtered files in filtered/ subdirectory
filtFs <- file.path(path, "filtered", paste0(sample.names, "_F_filt.fastq.gz"))
filtRs <- file.path(path, "filtered", paste0(sample.names, "_R_filt.fastq.gz"))
names(filtFs) <- sample.names
names(filtRs) <- sample.names

Barcodes and primers

Sequence reads in file.path were already demultiplexed by the sequencing center, so no barcodes should be present. There are, however, two sets of primers on each read mate. The Fluidigm amplicon structure results in reads with a CS (Fluidigm-specific) primer and the V3-V4 primer. Both need to be removed from each end. According to the sequencing results from the W. M. Keck Center:

fwd16Sprimer <- "CCTACGGGNGGCWGCAG"
rev16Sprimer <- "GACTACHVGGGTATCTAATCC"
CS1primer <- "AGACCAAGTCTCTGC"
CS2primer <- "TGTAGAACCATGTC"
fwdprimerlen <- nchar(fwd16Sprimer) + nchar(CS1primer)
revprimerlen <- nchar(rev16Sprimer) + nchar(CS2primer)

Filtering parameters: The standard filtering parameters (maxN=0 (DADA2 requires no N’s), truncQ=2, rm.phix=TRUE and maxEE=2) are starting points, not set in stone. If you want to speed up downstream computation, consider tightening maxEE. If too few reads are passing the filter, consider relaxing maxEE, perhaps especially on the reverse reads (eg. maxEE=c(2,5)), and reducing the truncLen to remove low quality tails. Remember though, when choosing truncLen for paired-end reads you must maintain overlap after truncation in order to merge them later.

out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs,
              trimLeft = c(fwdprimerlen, revprimerlen), # removes primers (FWD_PRIMER_LEN, REV_PRIMER_LEN)
              truncLen=c(0,0), # Removes  tails: first number is truncation site for forwards reads, second is for reverse reads
              maxN=0, # After truncation, sequences with more than maxN Ns will be discarded (dada2 does not allow Ns)
              maxEE=c(2,5), # Sets the maximum number of “expected errors” allowed in a read (forward, reverse)
              truncQ=2, # Truncate reads at the first instance of a quality score less than or equal to truncQ
              # minQ=30, # Only keep reads with a quality score > 30 (high-quality)
              rm.phix=TRUE,
              compress=TRUE, verbose=TRUE, multithread=TRUE) # On Windows set multithread=FALSE
out
##                                                       reads.in reads.out
## V3_F357_N_V4_R805-141_1_N_N_GTGTATGCGT_R1.fastq           5473      4890
## V3_F357_N_V4_R805-141_1_N_S_GTATCGTCGT_R1.fastq           3995      3605
## V3_F357_N_V4_R805-141_1_R_N_GTCGTCGTCT_R1.fastq           3115      2767
## V3_F357_N_V4_R805-141_1_R_S_TGCTCGTAGT_R1.fastq           3535      3159
## V3_F357_N_V4_R805-141_2_N_N_CGTGCTGTCA_R1.fastq           7526      6747
## V3_F357_N_V4_R805-141_2_N_S_GAGCTAGTGA_R1.fastq           8069      7176
## V3_F357_N_V4_R805-141_2_R_N_GTGCTGTCGT_R1.fastq           7526      6677
## V3_F357_N_V4_R805-141_2_R_S_GATCGTCTCT_R1.fastq           3975      3473
## V3_F357_N_V4_R805-141_5_N_N_CGCTATCAGT_R1.fastq           6377      5680
## V3_F357_N_V4_R805-141_5_N_S_GAGTGATCGT_R1.fastq           4097      3651
## V3_F357_N_V4_R805-141_5_R_N_GCTAGTGAGT_R1.fastq           5271      4688
## V3_F357_N_V4_R805-141_5_R_S_CGCTGTAGTC_R1.fastq           5682      4987
## V3_F357_N_V4_R805-141_7_N_N_GCGTCGTGTA_R1.fastq           7335      6646
## V3_F357_N_V4_R805-141_7_N_S_GTGCGTGTGT_R1.fastq           3881      3522
## V3_F357_N_V4_R805-141_7_R_N_GATGTAGCGT_R1.fastq           4423      3943
## V3_F357_N_V4_R805-141_7_R_S_GTCGTGTACT_R1.fastq           4497      3954
## V3_F357_N_V4_R805-717_10_N_N_GTGAGAGACA_R1.fastq         10464      9461
## V3_F357_N_V4_R805-717_10_N_S_TACATCGCTG_R1.fastq          8360      7529
## V3_F357_N_V4_R805-717_10_R_N_GCACGTAGCT_R1.fastq          8229      7225
## V3_F357_N_V4_R805-717_10_R_S_GACTGTACGT_R1.fastq          6610      5949
## V3_F357_N_V4_R805-717_5_N_N_TAGTAGCGCG_R1.fastq           7726      7020
## V3_F357_N_V4_R805-717_5_N_S_TACTGAGCTG_R1.fastq           7510      6754
## V3_F357_N_V4_R805-717_5_R_N_GTACTCGCGA_R1.fastq           6888      6122
## V3_F357_N_V4_R805-717_5_R_S_GACGTCTGCT_R1.fastq           5998      5234
## V3_F357_N_V4_R805-717_6_N_N_CGTACTACGT_R1.fastq           7647      6792
## V3_F357_N_V4_R805-717_6_N_S_TCACGCTATG_R1.fastq           6983      6198
## V3_F357_N_V4_R805-717_6_R_N_GAGATCAGTC_R1.fastq           7250      6418
## V3_F357_N_V4_R805-717_6_R_S_CAGCTGAGTA_R1.fastq           5063      4429
## V3_F357_N_V4_R805-717_9_N_N_TAGACGTGCT_R1.fastq           5144      4665
## V3_F357_N_V4_R805-717_9_N_S_TCTGAGCGCA_R1.fastq           6497      5781
## V3_F357_N_V4_R805-717_9_R_N_TCGAGTAGCG_R1.fastq           4916      4360
## V3_F357_N_V4_R805-717_9_R_S_GTGACTCGTC_R1.fastq           4581      4059
## V3_F357_N_V4_R805-733_10_N_N_TAGTCTGTCA_R1.fastq         10293      9289
## V3_F357_N_V4_R805-733_10_N_S_CGTATGATGT_R1.fastq         10237      9129
## V3_F357_N_V4_R805-733_10_R_N_CTAGAGTATC_R1.fastq          5429      4812
## V3_F357_N_V4_R805-733_10_R_S_TGTCTCTATC_R1.fastq          6413      5743
## V3_F357_N_V4_R805-733_4_N_N_TATCGATGCT_R1.fastq           8426      7434
## V3_F357_N_V4_R805-733_4_N_S_TGTGTCACTA_R1.fastq           7478      6840
## V3_F357_N_V4_R805-733_4_R_N_CATGCATCAT_R1.fastq           7036      6173
## V3_F357_N_V4_R805-733_4_R_S_TAGAGTCTGT_R1.fastq           3665      3274
## V3_F357_N_V4_R805-733_8_N_N_CGTCTATGAT_R1.fastq           7841      6912
## V3_F357_N_V4_R805-733_8_N_S_TGATCAGTCA_R1.fastq           7764      7038
## V3_F357_N_V4_R805-733_8_R_N_CTAGATCTGA_R1.fastq           7260      6448
## V3_F357_N_V4_R805-733_8_R_S_GTGATACTGA_R1.fastq           6018      5235
## V3_F357_N_V4_R805-733_9_N_N_CATGAGTGTA_R1.fastq           1366      1214
## V3_F357_N_V4_R805-733_9_N_S_TATCATGTGC_R1.fastq           6092      5519
## V3_F357_N_V4_R805-733_9_R_N_TATCTCATGC_R1.fastq           3648      3270
## V3_F357_N_V4_R805-733_9_R_S_TGTCGTCATA_R1.fastq           4583      4132
## V3_F357_N_V4_R805-KIT_NEG_CTRL_CGAGTGCTGT_R1.fastq          24        12
## V3_F357_N_V4_R805-KIT_NEG_CTRL2_TCAGATGCTA_R1.fastq         35        20
## V3_F357_N_V4_R805-WATER_NEG_CTRL_GTATGAGCAC_R1.fastq        35        22
## V3_F357_N_V4_R805-WATER_NEG_CTRL2_TATCAGTCTG_R1.fastq       48        18
## V3_F357_N_V4_R805-WaterFG_TATAGCACGC_R1.fastq               27        16
## V3_F357_N_V4_R805-WaterFG2_TATGTACGTG_R1.fastq              28        16

Dereplicate

The next thing we want to do is “dereplicate” the filtered fastq files. During dereplication, we condense the data by collapsing together all reads that encode the same sequence, which significantly reduces later computation times:

derepFs <- derepFastq(filtFs, verbose=TRUE)
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_1_N_N_F_filt.fastq.gz
## Encountered 1081 unique sequences from 4890 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_1_N_S_F_filt.fastq.gz
## Encountered 978 unique sequences from 3605 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_1_R_N_F_filt.fastq.gz
## Encountered 1098 unique sequences from 2767 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_1_R_S_F_filt.fastq.gz
## Encountered 821 unique sequences from 3159 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_2_N_N_F_filt.fastq.gz
## Encountered 1171 unique sequences from 6747 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_2_N_S_F_filt.fastq.gz
## Encountered 1374 unique sequences from 7176 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_2_R_N_F_filt.fastq.gz
## Encountered 1740 unique sequences from 6677 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_2_R_S_F_filt.fastq.gz
## Encountered 782 unique sequences from 3473 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_5_N_N_F_filt.fastq.gz
## Encountered 1029 unique sequences from 5680 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_5_N_S_F_filt.fastq.gz
## Encountered 897 unique sequences from 3651 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_5_R_N_F_filt.fastq.gz
## Encountered 1691 unique sequences from 4688 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_5_R_S_F_filt.fastq.gz
## Encountered 1071 unique sequences from 4987 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_7_N_N_F_filt.fastq.gz
## Encountered 1293 unique sequences from 6646 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_7_N_S_F_filt.fastq.gz
## Encountered 863 unique sequences from 3522 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_7_R_N_F_filt.fastq.gz
## Encountered 1499 unique sequences from 3943 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_7_R_S_F_filt.fastq.gz
## Encountered 832 unique sequences from 3954 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_10_N_N_F_filt.fastq.gz
## Encountered 1658 unique sequences from 9461 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_10_N_S_F_filt.fastq.gz
## Encountered 1300 unique sequences from 7529 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_10_R_N_F_filt.fastq.gz
## Encountered 3068 unique sequences from 7225 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_10_R_S_F_filt.fastq.gz
## Encountered 985 unique sequences from 5949 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_5_N_N_F_filt.fastq.gz
## Encountered 1381 unique sequences from 7020 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_5_N_S_F_filt.fastq.gz
## Encountered 1281 unique sequences from 6754 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_5_R_N_F_filt.fastq.gz
## Encountered 2109 unique sequences from 6122 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_5_R_S_F_filt.fastq.gz
## Encountered 1064 unique sequences from 5234 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_6_N_N_F_filt.fastq.gz
## Encountered 1273 unique sequences from 6792 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_6_N_S_F_filt.fastq.gz
## Encountered 1188 unique sequences from 6198 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_6_R_N_F_filt.fastq.gz
## Encountered 2419 unique sequences from 6418 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_6_R_S_F_filt.fastq.gz
## Encountered 907 unique sequences from 4429 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_9_N_N_F_filt.fastq.gz
## Encountered 1099 unique sequences from 4665 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_9_N_S_F_filt.fastq.gz
## Encountered 1221 unique sequences from 5781 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_9_R_N_F_filt.fastq.gz
## Encountered 1508 unique sequences from 4360 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_9_R_S_F_filt.fastq.gz
## Encountered 884 unique sequences from 4059 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_10_N_N_F_filt.fastq.gz
## Encountered 2059 unique sequences from 9289 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_10_N_S_F_filt.fastq.gz
## Encountered 1622 unique sequences from 9129 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_10_R_N_F_filt.fastq.gz
## Encountered 2041 unique sequences from 4812 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_10_R_S_F_filt.fastq.gz
## Encountered 1207 unique sequences from 5743 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_4_N_N_F_filt.fastq.gz
## Encountered 2192 unique sequences from 7434 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_4_N_S_F_filt.fastq.gz
## Encountered 1247 unique sequences from 6840 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_4_R_N_F_filt.fastq.gz
## Encountered 1535 unique sequences from 6173 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_4_R_S_F_filt.fastq.gz
## Encountered 827 unique sequences from 3274 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_8_N_N_F_filt.fastq.gz
## Encountered 1231 unique sequences from 6912 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_8_N_S_F_filt.fastq.gz
## Encountered 1340 unique sequences from 7038 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_8_R_N_F_filt.fastq.gz
## Encountered 1646 unique sequences from 6448 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_8_R_S_F_filt.fastq.gz
## Encountered 1432 unique sequences from 5235 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_9_N_N_F_filt.fastq.gz
## Encountered 413 unique sequences from 1214 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_9_N_S_F_filt.fastq.gz
## Encountered 1114 unique sequences from 5519 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_9_R_N_F_filt.fastq.gz
## Encountered 765 unique sequences from 3270 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_9_R_S_F_filt.fastq.gz
## Encountered 1007 unique sequences from 4132 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/KIT_NEG_CTRL_F_filt.fastq.gz
## Encountered 8 unique sequences from 12 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/KIT_NEG_CTRL2_F_filt.fastq.gz
## Encountered 12 unique sequences from 20 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/WATER_NEG_CTRL_F_filt.fastq.gz
## Encountered 14 unique sequences from 22 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/WATER_NEG_CTRL2_F_filt.fastq.gz
## Encountered 11 unique sequences from 18 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/WaterFG_F_filt.fastq.gz
## Encountered 12 unique sequences from 16 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/WaterFG2_F_filt.fastq.gz
## Encountered 9 unique sequences from 16 total sequences read.
derepRs <- derepFastq(filtRs, verbose=TRUE)
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_1_N_N_R_filt.fastq.gz
## Encountered 1098 unique sequences from 4890 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_1_N_S_R_filt.fastq.gz
## Encountered 942 unique sequences from 3605 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_1_R_N_R_filt.fastq.gz
## Encountered 1025 unique sequences from 2767 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_1_R_S_R_filt.fastq.gz
## Encountered 805 unique sequences from 3159 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_2_N_N_R_filt.fastq.gz
## Encountered 1283 unique sequences from 6747 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_2_N_S_R_filt.fastq.gz
## Encountered 1436 unique sequences from 7176 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_2_R_N_R_filt.fastq.gz
## Encountered 1654 unique sequences from 6677 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_2_R_S_R_filt.fastq.gz
## Encountered 788 unique sequences from 3473 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_5_N_N_R_filt.fastq.gz
## Encountered 1078 unique sequences from 5680 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_5_N_S_R_filt.fastq.gz
## Encountered 930 unique sequences from 3651 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_5_R_N_R_filt.fastq.gz
## Encountered 1556 unique sequences from 4688 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_5_R_S_R_filt.fastq.gz
## Encountered 1099 unique sequences from 4987 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_7_N_N_R_filt.fastq.gz
## Encountered 1305 unique sequences from 6646 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_7_N_S_R_filt.fastq.gz
## Encountered 819 unique sequences from 3522 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_7_R_N_R_filt.fastq.gz
## Encountered 1452 unique sequences from 3943 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/141_7_R_S_R_filt.fastq.gz
## Encountered 846 unique sequences from 3954 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_10_N_N_R_filt.fastq.gz
## Encountered 1719 unique sequences from 9461 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_10_N_S_R_filt.fastq.gz
## Encountered 1404 unique sequences from 7529 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_10_R_N_R_filt.fastq.gz
## Encountered 2973 unique sequences from 7225 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_10_R_S_R_filt.fastq.gz
## Encountered 1029 unique sequences from 5949 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_5_N_N_R_filt.fastq.gz
## Encountered 1465 unique sequences from 7020 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_5_N_S_R_filt.fastq.gz
## Encountered 1378 unique sequences from 6754 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_5_R_N_R_filt.fastq.gz
## Encountered 2014 unique sequences from 6122 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_5_R_S_R_filt.fastq.gz
## Encountered 1018 unique sequences from 5234 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_6_N_N_R_filt.fastq.gz
## Encountered 1381 unique sequences from 6792 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_6_N_S_R_filt.fastq.gz
## Encountered 1248 unique sequences from 6198 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_6_R_N_R_filt.fastq.gz
## Encountered 2277 unique sequences from 6418 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_6_R_S_R_filt.fastq.gz
## Encountered 985 unique sequences from 4429 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_9_N_N_R_filt.fastq.gz
## Encountered 1101 unique sequences from 4665 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_9_N_S_R_filt.fastq.gz
## Encountered 1243 unique sequences from 5781 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_9_R_N_R_filt.fastq.gz
## Encountered 1458 unique sequences from 4360 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/717_9_R_S_R_filt.fastq.gz
## Encountered 967 unique sequences from 4059 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_10_N_N_R_filt.fastq.gz
## Encountered 2014 unique sequences from 9289 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_10_N_S_R_filt.fastq.gz
## Encountered 1700 unique sequences from 9129 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_10_R_N_R_filt.fastq.gz
## Encountered 1887 unique sequences from 4812 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_10_R_S_R_filt.fastq.gz
## Encountered 1189 unique sequences from 5743 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_4_N_N_R_filt.fastq.gz
## Encountered 1741 unique sequences from 7434 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_4_N_S_R_filt.fastq.gz
## Encountered 1301 unique sequences from 6840 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_4_R_N_R_filt.fastq.gz
## Encountered 1524 unique sequences from 6173 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_4_R_S_R_filt.fastq.gz
## Encountered 870 unique sequences from 3274 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_8_N_N_R_filt.fastq.gz
## Encountered 1345 unique sequences from 6912 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_8_N_S_R_filt.fastq.gz
## Encountered 1375 unique sequences from 7038 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_8_R_N_R_filt.fastq.gz
## Encountered 1701 unique sequences from 6448 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_8_R_S_R_filt.fastq.gz
## Encountered 1599 unique sequences from 5235 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_9_N_N_R_filt.fastq.gz
## Encountered 413 unique sequences from 1214 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_9_N_S_R_filt.fastq.gz
## Encountered 1131 unique sequences from 5519 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_9_R_N_R_filt.fastq.gz
## Encountered 734 unique sequences from 3270 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/733_9_R_S_R_filt.fastq.gz
## Encountered 1049 unique sequences from 4132 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/KIT_NEG_CTRL_R_filt.fastq.gz
## Encountered 10 unique sequences from 12 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/KIT_NEG_CTRL2_R_filt.fastq.gz
## Encountered 13 unique sequences from 20 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/WATER_NEG_CTRL_R_filt.fastq.gz
## Encountered 15 unique sequences from 22 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/WATER_NEG_CTRL2_R_filt.fastq.gz
## Encountered 11 unique sequences from 18 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/WaterFG_R_filt.fastq.gz
## Encountered 11 unique sequences from 16 total sequences read.
## Dereplicating sequence entries in Fastq file: ./16S/V3_F357_N_V4_R805/filtered/WaterFG2_R_filt.fastq.gz
## Encountered 10 unique sequences from 16 total sequences read.

Dereplication is a common step in almost all modern ASV inference (or OTU picking) pipelines, but a unique feature of derepFastq is that it maintains a summary of the quality information for each dereplicated sequence in $quals.

Learn the Error Rates

The DADA2 algorithm makes use of a parametric error model (err) and every amplicon dataset has a different set of error rates. The learnErrors method learns this error model from the data, by alternating estimation of the error rates and inference of sample composition until they converge on a jointly consistent solution. As in many machine-learning problems, the algorithm must begin with an initial guess, for which the maximum possible error rates in this data are used (the error rates if only the most abundant sequence is correct and all the rest are errors).

errF <- learnErrors(derepFs, multithread=TRUE)
## 58015566 total bases in 266127 reads from 54 samples will be used for learning the error rates.
errR <- learnErrors(derepRs, multithread=TRUE)
## 57217221 total bases in 266127 reads from 54 samples will be used for learning the error rates.

Visualize estimated error rates:

plotErrors(errF, nominalQ=TRUE)
## Warning: Transformation introduced infinite values in continuous y-axis

plotErrors(errR, nominalQ=TRUE)
## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis

The error rates for each possible transition (A→C, A→G, …) are shown. Points are the observed error rates for each consensus quality score. The black line shows the estimated error rates after convergence of the machine-learning algorithm. The red line shows the error rates expected under the nominal definition of the Q-score. Here the estimated error rates (black line) are a good fit to the observed rates (points), and the error rates drop with increased quality as expected. Everything looks reasonable and we proceed with confidence.

Sample Inference

We are now ready to apply the core sample inference algorithm to the filtered and trimmed sequence data.

dadaFs <- dada(derepFs, err=errF, multithread=TRUE)
## Sample 1 - 4890 reads in 1081 unique sequences.
## Sample 2 - 3605 reads in 978 unique sequences.
## Sample 3 - 2767 reads in 1098 unique sequences.
## Sample 4 - 3159 reads in 821 unique sequences.
## Sample 5 - 6747 reads in 1171 unique sequences.
## Sample 6 - 7176 reads in 1374 unique sequences.
## Sample 7 - 6677 reads in 1740 unique sequences.
## Sample 8 - 3473 reads in 782 unique sequences.
## Sample 9 - 5680 reads in 1029 unique sequences.
## Sample 10 - 3651 reads in 897 unique sequences.
## Sample 11 - 4688 reads in 1691 unique sequences.
## Sample 12 - 4987 reads in 1071 unique sequences.
## Sample 13 - 6646 reads in 1293 unique sequences.
## Sample 14 - 3522 reads in 863 unique sequences.
## Sample 15 - 3943 reads in 1499 unique sequences.
## Sample 16 - 3954 reads in 832 unique sequences.
## Sample 17 - 9461 reads in 1658 unique sequences.
## Sample 18 - 7529 reads in 1300 unique sequences.
## Sample 19 - 7225 reads in 3068 unique sequences.
## Sample 20 - 5949 reads in 985 unique sequences.
## Sample 21 - 7020 reads in 1381 unique sequences.
## Sample 22 - 6754 reads in 1281 unique sequences.
## Sample 23 - 6122 reads in 2109 unique sequences.
## Sample 24 - 5234 reads in 1064 unique sequences.
## Sample 25 - 6792 reads in 1273 unique sequences.
## Sample 26 - 6198 reads in 1188 unique sequences.
## Sample 27 - 6418 reads in 2419 unique sequences.
## Sample 28 - 4429 reads in 907 unique sequences.
## Sample 29 - 4665 reads in 1099 unique sequences.
## Sample 30 - 5781 reads in 1221 unique sequences.
## Sample 31 - 4360 reads in 1508 unique sequences.
## Sample 32 - 4059 reads in 884 unique sequences.
## Sample 33 - 9289 reads in 2059 unique sequences.
## Sample 34 - 9129 reads in 1622 unique sequences.
## Sample 35 - 4812 reads in 2041 unique sequences.
## Sample 36 - 5743 reads in 1207 unique sequences.
## Sample 37 - 7434 reads in 2192 unique sequences.
## Sample 38 - 6840 reads in 1247 unique sequences.
## Sample 39 - 6173 reads in 1535 unique sequences.
## Sample 40 - 3274 reads in 827 unique sequences.
## Sample 41 - 6912 reads in 1231 unique sequences.
## Sample 42 - 7038 reads in 1340 unique sequences.
## Sample 43 - 6448 reads in 1646 unique sequences.
## Sample 44 - 5235 reads in 1432 unique sequences.
## Sample 45 - 1214 reads in 413 unique sequences.
## Sample 46 - 5519 reads in 1114 unique sequences.
## Sample 47 - 3270 reads in 765 unique sequences.
## Sample 48 - 4132 reads in 1007 unique sequences.
## Sample 49 - 12 reads in 8 unique sequences.
## Sample 50 - 20 reads in 12 unique sequences.
## Sample 51 - 22 reads in 14 unique sequences.
## Sample 52 - 18 reads in 11 unique sequences.
## Sample 53 - 16 reads in 12 unique sequences.
## Sample 54 - 16 reads in 9 unique sequences.
dadaRs <- dada(derepRs, err=errR, multithread=TRUE)
## Sample 1 - 4890 reads in 1098 unique sequences.
## Sample 2 - 3605 reads in 942 unique sequences.
## Sample 3 - 2767 reads in 1025 unique sequences.
## Sample 4 - 3159 reads in 805 unique sequences.
## Sample 5 - 6747 reads in 1283 unique sequences.
## Sample 6 - 7176 reads in 1436 unique sequences.
## Sample 7 - 6677 reads in 1654 unique sequences.
## Sample 8 - 3473 reads in 788 unique sequences.
## Sample 9 - 5680 reads in 1078 unique sequences.
## Sample 10 - 3651 reads in 930 unique sequences.
## Sample 11 - 4688 reads in 1556 unique sequences.
## Sample 12 - 4987 reads in 1099 unique sequences.
## Sample 13 - 6646 reads in 1305 unique sequences.
## Sample 14 - 3522 reads in 819 unique sequences.
## Sample 15 - 3943 reads in 1452 unique sequences.
## Sample 16 - 3954 reads in 846 unique sequences.
## Sample 17 - 9461 reads in 1719 unique sequences.
## Sample 18 - 7529 reads in 1404 unique sequences.
## Sample 19 - 7225 reads in 2973 unique sequences.
## Sample 20 - 5949 reads in 1029 unique sequences.
## Sample 21 - 7020 reads in 1465 unique sequences.
## Sample 22 - 6754 reads in 1378 unique sequences.
## Sample 23 - 6122 reads in 2014 unique sequences.
## Sample 24 - 5234 reads in 1018 unique sequences.
## Sample 25 - 6792 reads in 1381 unique sequences.
## Sample 26 - 6198 reads in 1248 unique sequences.
## Sample 27 - 6418 reads in 2277 unique sequences.
## Sample 28 - 4429 reads in 985 unique sequences.
## Sample 29 - 4665 reads in 1101 unique sequences.
## Sample 30 - 5781 reads in 1243 unique sequences.
## Sample 31 - 4360 reads in 1458 unique sequences.
## Sample 32 - 4059 reads in 967 unique sequences.
## Sample 33 - 9289 reads in 2014 unique sequences.
## Sample 34 - 9129 reads in 1700 unique sequences.
## Sample 35 - 4812 reads in 1887 unique sequences.
## Sample 36 - 5743 reads in 1189 unique sequences.
## Sample 37 - 7434 reads in 1741 unique sequences.
## Sample 38 - 6840 reads in 1301 unique sequences.
## Sample 39 - 6173 reads in 1524 unique sequences.
## Sample 40 - 3274 reads in 870 unique sequences.
## Sample 41 - 6912 reads in 1345 unique sequences.
## Sample 42 - 7038 reads in 1375 unique sequences.
## Sample 43 - 6448 reads in 1701 unique sequences.
## Sample 44 - 5235 reads in 1599 unique sequences.
## Sample 45 - 1214 reads in 413 unique sequences.
## Sample 46 - 5519 reads in 1131 unique sequences.
## Sample 47 - 3270 reads in 734 unique sequences.
## Sample 48 - 4132 reads in 1049 unique sequences.
## Sample 49 - 12 reads in 10 unique sequences.
## Sample 50 - 20 reads in 13 unique sequences.
## Sample 51 - 22 reads in 15 unique sequences.
## Sample 52 - 18 reads in 11 unique sequences.
## Sample 53 - 16 reads in 11 unique sequences.
## Sample 54 - 16 reads in 10 unique sequences.
saveRDS(dadaFs, file = "./16S/dadaFs.RDS")
saveRDS(dadaRs, file = "./16S/dadaRs.RDS")

Inspecting the returned dada-class objects:

dadaFs[[1]]
## dada-class: object describing DADA2 denoising results
## 11 sequence variants were inferred from 1081 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
dadaRs[[1]]
## dada-class: object describing DADA2 denoising results
## 7 sequence variants were inferred from 1098 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16

Load dada-class objects

readRDS(file = "./16S/dadaFs.RDS")
## $`141_1_N_N`
## dada-class: object describing DADA2 denoising results
## 11 sequence variants were inferred from 1081 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`141_1_N_S`
## dada-class: object describing DADA2 denoising results
## 10 sequence variants were inferred from 978 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`141_1_R_N`
## dada-class: object describing DADA2 denoising results
## 30 sequence variants were inferred from 1098 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`141_1_R_S`
## dada-class: object describing DADA2 denoising results
## 6 sequence variants were inferred from 821 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`141_2_N_N`
## dada-class: object describing DADA2 denoising results
## 15 sequence variants were inferred from 1171 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`141_2_N_S`
## dada-class: object describing DADA2 denoising results
## 11 sequence variants were inferred from 1374 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`141_2_R_N`
## dada-class: object describing DADA2 denoising results
## 34 sequence variants were inferred from 1740 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`141_2_R_S`
## dada-class: object describing DADA2 denoising results
## 6 sequence variants were inferred from 782 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`141_5_N_N`
## dada-class: object describing DADA2 denoising results
## 15 sequence variants were inferred from 1029 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`141_5_N_S`
## dada-class: object describing DADA2 denoising results
## 8 sequence variants were inferred from 897 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`141_5_R_N`
## dada-class: object describing DADA2 denoising results
## 52 sequence variants were inferred from 1691 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`141_5_R_S`
## dada-class: object describing DADA2 denoising results
## 8 sequence variants were inferred from 1071 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`141_7_N_N`
## dada-class: object describing DADA2 denoising results
## 14 sequence variants were inferred from 1293 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`141_7_N_S`
## dada-class: object describing DADA2 denoising results
## 12 sequence variants were inferred from 863 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`141_7_R_N`
## dada-class: object describing DADA2 denoising results
## 44 sequence variants were inferred from 1499 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`141_7_R_S`
## dada-class: object describing DADA2 denoising results
## 5 sequence variants were inferred from 832 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`717_10_N_N`
## dada-class: object describing DADA2 denoising results
## 23 sequence variants were inferred from 1658 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`717_10_N_S`
## dada-class: object describing DADA2 denoising results
## 11 sequence variants were inferred from 1300 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`717_10_R_N`
## dada-class: object describing DADA2 denoising results
## 88 sequence variants were inferred from 3068 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`717_10_R_S`
## dada-class: object describing DADA2 denoising results
## 6 sequence variants were inferred from 985 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`717_5_N_N`
## dada-class: object describing DADA2 denoising results
## 19 sequence variants were inferred from 1381 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`717_5_N_S`
## dada-class: object describing DADA2 denoising results
## 7 sequence variants were inferred from 1281 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`717_5_R_N`
## dada-class: object describing DADA2 denoising results
## 50 sequence variants were inferred from 2109 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`717_5_R_S`
## dada-class: object describing DADA2 denoising results
## 10 sequence variants were inferred from 1064 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`717_6_N_N`
## dada-class: object describing DADA2 denoising results
## 20 sequence variants were inferred from 1273 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`717_6_N_S`
## dada-class: object describing DADA2 denoising results
## 8 sequence variants were inferred from 1188 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`717_6_R_N`
## dada-class: object describing DADA2 denoising results
## 62 sequence variants were inferred from 2419 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`717_6_R_S`
## dada-class: object describing DADA2 denoising results
## 7 sequence variants were inferred from 907 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`717_9_N_N`
## dada-class: object describing DADA2 denoising results
## 26 sequence variants were inferred from 1099 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`717_9_N_S`
## dada-class: object describing DADA2 denoising results
## 5 sequence variants were inferred from 1221 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`717_9_R_N`
## dada-class: object describing DADA2 denoising results
## 39 sequence variants were inferred from 1508 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`717_9_R_S`
## dada-class: object describing DADA2 denoising results
## 6 sequence variants were inferred from 884 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`733_10_N_N`
## dada-class: object describing DADA2 denoising results
## 39 sequence variants were inferred from 2059 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`733_10_N_S`
## dada-class: object describing DADA2 denoising results
## 11 sequence variants were inferred from 1622 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`733_10_R_N`
## dada-class: object describing DADA2 denoising results
## 59 sequence variants were inferred from 2041 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`733_10_R_S`
## dada-class: object describing DADA2 denoising results
## 16 sequence variants were inferred from 1207 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`733_4_N_N`
## dada-class: object describing DADA2 denoising results
## 34 sequence variants were inferred from 2192 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`733_4_N_S`
## dada-class: object describing DADA2 denoising results
## 9 sequence variants were inferred from 1247 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`733_4_R_N`
## dada-class: object describing DADA2 denoising results
## 28 sequence variants were inferred from 1535 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`733_4_R_S`
## dada-class: object describing DADA2 denoising results
## 5 sequence variants were inferred from 827 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`733_8_N_N`
## dada-class: object describing DADA2 denoising results
## 9 sequence variants were inferred from 1231 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`733_8_N_S`
## dada-class: object describing DADA2 denoising results
## 10 sequence variants were inferred from 1340 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`733_8_R_N`
## dada-class: object describing DADA2 denoising results
## 23 sequence variants were inferred from 1646 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`733_8_R_S`
## dada-class: object describing DADA2 denoising results
## 31 sequence variants were inferred from 1432 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`733_9_N_N`
## dada-class: object describing DADA2 denoising results
## 23 sequence variants were inferred from 413 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`733_9_N_S`
## dada-class: object describing DADA2 denoising results
## 5 sequence variants were inferred from 1114 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`733_9_R_N`
## dada-class: object describing DADA2 denoising results
## 16 sequence variants were inferred from 765 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`733_9_R_S`
## dada-class: object describing DADA2 denoising results
## 14 sequence variants were inferred from 1007 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $KIT_NEG_CTRL
## dada-class: object describing DADA2 denoising results
## 1 sequence variants were inferred from 8 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $KIT_NEG_CTRL2
## dada-class: object describing DADA2 denoising results
## 1 sequence variants were inferred from 12 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $WATER_NEG_CTRL
## dada-class: object describing DADA2 denoising results
## 1 sequence variants were inferred from 14 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $WATER_NEG_CTRL2
## dada-class: object describing DADA2 denoising results
## 1 sequence variants were inferred from 11 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $WaterFG
## dada-class: object describing DADA2 denoising results
## 1 sequence variants were inferred from 12 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $WaterFG2
## dada-class: object describing DADA2 denoising results
## 1 sequence variants were inferred from 9 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
readRDS(file = "./16S/dadaRs.RDS")
## $`141_1_N_N`
## dada-class: object describing DADA2 denoising results
## 7 sequence variants were inferred from 1098 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`141_1_N_S`
## dada-class: object describing DADA2 denoising results
## 7 sequence variants were inferred from 942 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`141_1_R_N`
## dada-class: object describing DADA2 denoising results
## 27 sequence variants were inferred from 1025 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`141_1_R_S`
## dada-class: object describing DADA2 denoising results
## 4 sequence variants were inferred from 805 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`141_2_N_N`
## dada-class: object describing DADA2 denoising results
## 12 sequence variants were inferred from 1283 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`141_2_N_S`
## dada-class: object describing DADA2 denoising results
## 7 sequence variants were inferred from 1436 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`141_2_R_N`
## dada-class: object describing DADA2 denoising results
## 38 sequence variants were inferred from 1654 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`141_2_R_S`
## dada-class: object describing DADA2 denoising results
## 5 sequence variants were inferred from 788 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`141_5_N_N`
## dada-class: object describing DADA2 denoising results
## 13 sequence variants were inferred from 1078 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`141_5_N_S`
## dada-class: object describing DADA2 denoising results
## 8 sequence variants were inferred from 930 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`141_5_R_N`
## dada-class: object describing DADA2 denoising results
## 46 sequence variants were inferred from 1556 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`141_5_R_S`
## dada-class: object describing DADA2 denoising results
## 5 sequence variants were inferred from 1099 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`141_7_N_N`
## dada-class: object describing DADA2 denoising results
## 11 sequence variants were inferred from 1305 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`141_7_N_S`
## dada-class: object describing DADA2 denoising results
## 7 sequence variants were inferred from 819 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`141_7_R_N`
## dada-class: object describing DADA2 denoising results
## 49 sequence variants were inferred from 1452 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`141_7_R_S`
## dada-class: object describing DADA2 denoising results
## 5 sequence variants were inferred from 846 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`717_10_N_N`
## dada-class: object describing DADA2 denoising results
## 20 sequence variants were inferred from 1719 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`717_10_N_S`
## dada-class: object describing DADA2 denoising results
## 5 sequence variants were inferred from 1404 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`717_10_R_N`
## dada-class: object describing DADA2 denoising results
## 89 sequence variants were inferred from 2973 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`717_10_R_S`
## dada-class: object describing DADA2 denoising results
## 4 sequence variants were inferred from 1029 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`717_5_N_N`
## dada-class: object describing DADA2 denoising results
## 11 sequence variants were inferred from 1465 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`717_5_N_S`
## dada-class: object describing DADA2 denoising results
## 7 sequence variants were inferred from 1378 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`717_5_R_N`
## dada-class: object describing DADA2 denoising results
## 48 sequence variants were inferred from 2014 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`717_5_R_S`
## dada-class: object describing DADA2 denoising results
## 5 sequence variants were inferred from 1018 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`717_6_N_N`
## dada-class: object describing DADA2 denoising results
## 16 sequence variants were inferred from 1381 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`717_6_N_S`
## dada-class: object describing DADA2 denoising results
## 5 sequence variants were inferred from 1248 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`717_6_R_N`
## dada-class: object describing DADA2 denoising results
## 66 sequence variants were inferred from 2277 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`717_6_R_S`
## dada-class: object describing DADA2 denoising results
## 6 sequence variants were inferred from 985 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`717_9_N_N`
## dada-class: object describing DADA2 denoising results
## 20 sequence variants were inferred from 1101 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`717_9_N_S`
## dada-class: object describing DADA2 denoising results
## 7 sequence variants were inferred from 1243 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`717_9_R_N`
## dada-class: object describing DADA2 denoising results
## 42 sequence variants were inferred from 1458 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`717_9_R_S`
## dada-class: object describing DADA2 denoising results
## 6 sequence variants were inferred from 967 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`733_10_N_N`
## dada-class: object describing DADA2 denoising results
## 35 sequence variants were inferred from 2014 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`733_10_N_S`
## dada-class: object describing DADA2 denoising results
## 7 sequence variants were inferred from 1700 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`733_10_R_N`
## dada-class: object describing DADA2 denoising results
## 57 sequence variants were inferred from 1887 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`733_10_R_S`
## dada-class: object describing DADA2 denoising results
## 14 sequence variants were inferred from 1189 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`733_4_N_N`
## dada-class: object describing DADA2 denoising results
## 32 sequence variants were inferred from 1741 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`733_4_N_S`
## dada-class: object describing DADA2 denoising results
## 9 sequence variants were inferred from 1301 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`733_4_R_N`
## dada-class: object describing DADA2 denoising results
## 25 sequence variants were inferred from 1524 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`733_4_R_S`
## dada-class: object describing DADA2 denoising results
## 5 sequence variants were inferred from 870 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`733_8_N_N`
## dada-class: object describing DADA2 denoising results
## 6 sequence variants were inferred from 1345 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`733_8_N_S`
## dada-class: object describing DADA2 denoising results
## 8 sequence variants were inferred from 1375 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`733_8_R_N`
## dada-class: object describing DADA2 denoising results
## 20 sequence variants were inferred from 1701 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`733_8_R_S`
## dada-class: object describing DADA2 denoising results
## 29 sequence variants were inferred from 1599 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`733_9_N_N`
## dada-class: object describing DADA2 denoising results
## 22 sequence variants were inferred from 413 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`733_9_N_S`
## dada-class: object describing DADA2 denoising results
## 5 sequence variants were inferred from 1131 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`733_9_R_N`
## dada-class: object describing DADA2 denoising results
## 13 sequence variants were inferred from 734 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $`733_9_R_S`
## dada-class: object describing DADA2 denoising results
## 14 sequence variants were inferred from 1049 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $KIT_NEG_CTRL
## dada-class: object describing DADA2 denoising results
## 1 sequence variants were inferred from 10 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $KIT_NEG_CTRL2
## dada-class: object describing DADA2 denoising results
## 1 sequence variants were inferred from 13 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $WATER_NEG_CTRL
## dada-class: object describing DADA2 denoising results
## 3 sequence variants were inferred from 15 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $WATER_NEG_CTRL2
## dada-class: object describing DADA2 denoising results
## 1 sequence variants were inferred from 11 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $WaterFG
## dada-class: object describing DADA2 denoising results
## 2 sequence variants were inferred from 11 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## 
## $WaterFG2
## dada-class: object describing DADA2 denoising results
## 1 sequence variants were inferred from 10 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16

Merge paired reads

We now merge the forward and reverse reads together to obtain the full denoised sequences. Merging is performed by aligning the denoised forward reads with the reverse-complement of the corresponding denoised reverse reads, and then constructing the merged “contig” sequences. By default, merged sequences are only output if the forward and reverse reads overlap by at least 12 bases, and are identical to each other in the overlap region (but these conditions can be changed via function arguments).

mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, verbose=TRUE)
## 4781 paired-reads (in 12 unique pairings) successfully merged out of 4828 (in 23 pairings) input.
## 3520 paired-reads (in 13 unique pairings) successfully merged out of 3556 (in 18 pairings) input.
## 2408 paired-reads (in 29 unique pairings) successfully merged out of 2588 (in 130 pairings) input.
## 3128 paired-reads (in 5 unique pairings) successfully merged out of 3143 (in 7 pairings) input.
## 6641 paired-reads (in 19 unique pairings) successfully merged out of 6691 (in 32 pairings) input.
## 7104 paired-reads (in 12 unique pairings) successfully merged out of 7131 (in 18 pairings) input.
## 6338 paired-reads (in 41 unique pairings) successfully merged out of 6483 (in 121 pairings) input.
## 3436 paired-reads (in 5 unique pairings) successfully merged out of 3449 (in 9 pairings) input.
## 5584 paired-reads (in 21 unique pairings) successfully merged out of 5634 (in 34 pairings) input.
## 3588 paired-reads (in 8 unique pairings) successfully merged out of 3614 (in 14 pairings) input.
## 4073 paired-reads (in 50 unique pairings) successfully merged out of 4394 (in 228 pairings) input.
## 4940 paired-reads (in 7 unique pairings) successfully merged out of 4958 (in 11 pairings) input.
## 6505 paired-reads (in 18 unique pairings) successfully merged out of 6567 (in 26 pairings) input.
## 3461 paired-reads (in 15 unique pairings) successfully merged out of 3502 (in 26 pairings) input.
## 3512 paired-reads (in 54 unique pairings) successfully merged out of 3728 (in 187 pairings) input.
## 3905 paired-reads (in 4 unique pairings) successfully merged out of 3927 (in 8 pairings) input.
## 9280 paired-reads (in 30 unique pairings) successfully merged out of 9355 (in 61 pairings) input.
## 7454 paired-reads (in 11 unique pairings) successfully merged out of 7480 (in 19 pairings) input.
## 6282 paired-reads (in 98 unique pairings) successfully merged out of 6904 (in 411 pairings) input.
## 5924 paired-reads (in 6 unique pairings) successfully merged out of 5935 (in 11 pairings) input.
## 6875 paired-reads (in 20 unique pairings) successfully merged out of 6938 (in 39 pairings) input.
## 6684 paired-reads (in 9 unique pairings) successfully merged out of 6705 (in 16 pairings) input.
## 5440 paired-reads (in 69 unique pairings) successfully merged out of 5849 (in 247 pairings) input.
## 5181 paired-reads (in 6 unique pairings) successfully merged out of 5193 (in 10 pairings) input.
## 6622 paired-reads (in 22 unique pairings) successfully merged out of 6708 (in 45 pairings) input.
## 6121 paired-reads (in 7 unique pairings) successfully merged out of 6152 (in 14 pairings) input.
## 5606 paired-reads (in 75 unique pairings) successfully merged out of 6134 (in 314 pairings) input.
## 4393 paired-reads (in 6 unique pairings) successfully merged out of 4405 (in 10 pairings) input.
## 4479 paired-reads (in 27 unique pairings) successfully merged out of 4570 (in 71 pairings) input.
## 5703 paired-reads (in 7 unique pairings) successfully merged out of 5730 (in 13 pairings) input.
## 3981 paired-reads (in 45 unique pairings) successfully merged out of 4199 (in 171 pairings) input.
## 4018 paired-reads (in 5 unique pairings) successfully merged out of 4031 (in 10 pairings) input.
## 8966 paired-reads (in 63 unique pairings) successfully merged out of 9165 (in 136 pairings) input.
## 9049 paired-reads (in 12 unique pairings) successfully merged out of 9079 (in 20 pairings) input.
## 3958 paired-reads (in 100 unique pairings) successfully merged out of 4563 (in 373 pairings) input.
## 5653 paired-reads (in 14 unique pairings) successfully merged out of 5681 (in 21 pairings) input.
## 7059 paired-reads (in 57 unique pairings) successfully merged out of 7220 (in 131 pairings) input.
## 6774 paired-reads (in 10 unique pairings) successfully merged out of 6797 (in 16 pairings) input.
## 5922 paired-reads (in 45 unique pairings) successfully merged out of 6026 (in 90 pairings) input.
## 3246 paired-reads (in 5 unique pairings) successfully merged out of 3251 (in 8 pairings) input.
## 6855 paired-reads (in 10 unique pairings) successfully merged out of 6875 (in 17 pairings) input.
## 6963 paired-reads (in 11 unique pairings) successfully merged out of 6989 (in 22 pairings) input.
## 6171 paired-reads (in 36 unique pairings) successfully merged out of 6307 (in 79 pairings) input.
## 5034 paired-reads (in 37 unique pairings) successfully merged out of 5100 (in 77 pairings) input.
## 1052 paired-reads (in 24 unique pairings) successfully merged out of 1117 (in 63 pairings) input.
## 5456 paired-reads (in 5 unique pairings) successfully merged out of 5475 (in 11 pairings) input.
## 3100 paired-reads (in 27 unique pairings) successfully merged out of 3193 (in 55 pairings) input.
## 4083 paired-reads (in 14 unique pairings) successfully merged out of 4097 (in 21 pairings) input.
## 9 paired-reads (in 1 unique pairings) successfully merged out of 9 (in 1 pairings) input.
## 15 paired-reads (in 1 unique pairings) successfully merged out of 15 (in 1 pairings) input.
## 15 paired-reads (in 1 unique pairings) successfully merged out of 15 (in 1 pairings) input.
## 17 paired-reads (in 1 unique pairings) successfully merged out of 17 (in 1 pairings) input.
## 8 paired-reads (in 1 unique pairings) successfully merged out of 8 (in 1 pairings) input.
## 13 paired-reads (in 1 unique pairings) successfully merged out of 13 (in 1 pairings) input.
# Inspect the merger data.frame from the first sample
head(mergers[[1]])
##                                                                                                                                                                                                                                                                                                                                                                                    sequence
## 1     AATGGGCGCAAGCCTGATCCAGCCATGCCGCGTGAGTGATGAAGGCCCTAGGGTTGTAAAGCTCTTTCACCGGTGAAGATAATGACGGTAACCGGAGAAGAAGCCCCGGCTAACTTCGTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATTGTTAAGTGAGGGGTGAAATCCCAGGGCTCAACCCTGGAACTGCCTTTCATACTGGCAATCTAGAGTCCAGAAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCGGCTCACTGGTCTGGAACTGACGCTGAGGTGCGAAAGC
## 2   AATGGGCGAAAGCCTGACGGAGCAATGCCGCGTGGAGGTAAAAGGCCTACGGGTCATGAACTTCTTTTCCTGGAGAAGAAACAATGACGGTATCCGGGGAATAAGCATCGGCTAACTCTGTGCCAGCAGCCGCGGTAAGACAGAGGATGCAAGCGTTATCCGGAATGATTGGGCGTAAAGCGTCTGTAGGTGGCTTTTTAAGTTCGCCGTCAAATCCCAGGGCTCAACCCTGGACAGGTGGTGAAAACTACTAAGCTAGAGTACGGTAGGGGCAGAGGGAATTTCCGGTGGAGCGGTGAAATGCGTAGAGATCGGAAGGAACACCAATGGCGAAAGCACTCTGCTGGGCCGACACTGACACTGAGAGACGAAAGC
## 3   AATGGGCGAAAGCCCGATCCAGCAATATCGCGTGAGTGAAGAAGGGCAATGCCGCTTGTAAAGCTCTTTCGTCGAGTGCGCGATCATGACAGGACTCGAGGAAGAAGCCCCGGCTAACTCCGTGCCAGCAGCCGCGGTAAGACGGGGGGGGCAAGTGTTCTTCGGAATGACTGGGCGTAAAGGGCACGTAGGCGGTGAATCGGGTTGAAAGTTAAAGTCGCCAAAAACTGGTGGAATGCTCTCGAAACCAATTCACTTGAGTGAGACAGAGGAGAGTGGAATTTCGTGTGTAGGGGTGAAATCCGCAGATCTACGAAGGAACGCCAAAAGCGAAGGCAGCTCTCTGGGTCCCTACCGACGCTGGAGTGCGAAAGC
## 4 AATGGGCGAAAGCCCGATCCAGCAATATCGCGTGAGTGAAGAAGGGCAATGCCGCTTGTAAAGCTCTTTCGTCGAGTGCGCGATCATGACAGGACTCGAGGAAGAAGCCCCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATTGTTAAGTGAGGGGTGAAATCCCAGGGCTCAACCCTGGAACTGCCTTTCATACTGGCAATCTAGAGTCCAGAAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCGGCTCACTGGTCTGGAACTGACGCTGAGGTGCGAAAGC
## 5       AATGGGCGCAAGCCTGATCCAGCCATGCCGCGTGAGTGATGAAGGCCCTAGGGTTGTAAAGCTCTTTCACCGGTGAAGATAATGACGGTAACCGGAGAAGAAGCCCCGGCTAACTTCGTGCCAGCAGCCGCGGTAAGACGGGGGGGGCAAGTGTTCTTCGGAATGACTGGGCGTAAAGGGCACGTAGGCGGTGAATCGGGTTGAAAGTTAAAGTCGCCAAAAACTGGTGGAATGCTCTCGAAACCAATTCACTTGAGTGAGACAGAGGAGAGTGGAATTTCGTGTGTAGGGGTGAAATCCGCAGATCTACGAAGGAACGCCAAAAGCGAAGGCAGCTCTCTGGGTCCCTACCGACGCTGGAGTGCGAAAGC
## 7   AATGGGCGAAAGCCTGACGGAGCAATGCCGCGTGGAGGTAAAAGGCCTACGGGTCATGAACTTCTTTTCCTGGAGAAGAAACAATGACGGTATCCGGGGAATAAGCATCGGCTAACTCTGTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATTGTTAAGTGAGGGGTGAAATCCCAGGGCTCAACCCTGGAACTGCCTTTCATACTGGCAATCTAGAGTCCAGAAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCGGCTCACTGGTCTGGAACTGACGCTGAGGTGCGAAAGC
##   abundance forward reverse nmatch nmismatch nindel prefer accept
## 1      3236       1       1     60         0      0      1   TRUE
## 2       919       2       2     58         0      0      1   TRUE
## 3       497       3       3     58         0      0      1   TRUE
## 4        36       4       1     56         0      0      2   TRUE
## 5        27       6       3     62         0      0      2   TRUE
## 7        17       7       1     58         0      0      2   TRUE

The mergers object is a list of data.frames from each sample. Each data.frame contains the merged sequence, its abundance, and the indices of the forward and reverse sequence variants that were merged. Paired reads that did not exactly overlap were removed by mergePairs, further reducing spurious output.

Construct sequence table

We can now construct an amplicon sequence variant table (ASV) table, a higher-resolution version of the OTU table produced by traditional methods.

seqtab <- makeSequenceTable(mergers)
dim(seqtab)
## [1]  54 525
# Inspect distribution of sequence lengths
table(nchar(getSequences(seqtab)))
## 
## 371 372 373 374 375 376 377 378 389 390 391 392 393 394 395 396 397 398 399 
##   6   1 200   4  60   1  17   4   1   1   1   4  11   1   7   3   4 195   4

The sequence table is a matrix with rows corresponding to (and named by) the samples, and columns corresponding to (and named by) the sequence variants.

Remove chimeras

The core dada method corrects substitution and indel errors, but chimeras remain. Fortunately, the accuracy of sequence variants after denoising makes identifying chimeric ASVs simpler than when dealing with fuzzy OTUs. Chimeric sequences are identified if they can be exactly reconstructed by combining a left-segment and a right-segment from two more abundant “parent” sequences.

seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE, verbose=TRUE)
## Identified 277 bimeras out of 525 input sequences.
dim(seqtab.nochim)
## [1]  54 248
sum(seqtab.nochim)/sum(seqtab)
## [1] 0.9802173
write.csv(seqtab.nochim, "./16S/sequence_table.csv")

Here chimeras make up about 2% of the merged sequence variants.

Track reads through the pipeline

As a final check of our progress, we’ll look at the number of reads that made it through each step in the pipeline:

getN <- function(x) sum(getUniques(x))
track <- cbind(out, sapply(dadaFs, getN), sapply(dadaRs, getN), sapply(mergers, getN), rowSums(seqtab.nochim))
# If processing a single sample, remove the sapply calls: e.g. replace sapply(dadaFs, getN) with getN(dadaFs)
colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged", "nonchim")
rownames(track) <- sample.names
track
##                 input filtered denoisedF denoisedR merged nonchim
## 141_1_N_N        5473     4890      4861      4848   4781    4663
## 141_1_N_S        3995     3605      3579      3580   3520    3377
## 141_1_R_N        3115     2767      2645      2648   2408    2372
## 141_1_R_S        3535     3159      3153      3146   3128    3095
## 141_2_N_N        7526     6747      6716      6710   6641    6476
## 141_2_N_S        8069     7176      7157      7145   7104    7036
## 141_2_R_N        7526     6677      6540      6580   6338    6267
## 141_2_R_S        3975     3473      3464      3453   3436    3436
## 141_5_N_N        6377     5680      5655      5651   5584    5408
## 141_5_N_S        4097     3651      3620      3639   3588    3579
## 141_5_R_N        5271     4688      4497      4538   4073    3950
## 141_5_R_S        5682     4987      4973      4962   4940    4926
## 141_7_N_N        7335     6646      6601      6599   6505    6318
## 141_7_N_S        3881     3522      3517      3506   3461    3344
## 141_7_R_N        4423     3943      3794      3841   3512    3444
## 141_7_R_S        4497     3954      3932      3942   3905    3905
## 717_10_N_N      10464     9461      9393      9400   9280    9091
## 717_10_N_S       8360     7529      7515      7486   7454    7352
## 717_10_R_N       8229     7225      7023      7046   6282    6217
## 717_10_R_S       6610     5949      5943      5935   5924    5920
## 717_5_N_N        7726     7020      6995      6950   6875    6754
## 717_5_N_S        7510     6754      6727      6727   6684    6648
## 717_5_R_N        6888     6122      5926      6002   5440    5210
## 717_5_R_S        5998     5234      5216      5199   5181    5172
## 717_6_N_N        7647     6792      6762      6731   6622    6486
## 717_6_N_S        6983     6198      6181      6167   6121    6106
## 717_6_R_N        7250     6418      6207      6303   5606    5546
## 717_6_R_S        5063     4429      4414      4411   4393    4393
## 717_9_N_N        5144     4665      4610      4613   4479    4386
## 717_9_N_S        6497     5781      5743      5761   5703    5695
## 717_9_R_N        4916     4360      4249      4276   3981    3919
## 717_9_R_S        4581     4059      4037      4047   4018    4018
## 733_10_N_N      10293     9289      9212      9230   8966    8464
## 733_10_N_S      10237     9129      9110      9088   9049    8981
## 733_10_R_N       5429     4812      4678      4663   3958    3352
## 733_10_R_S       6413     5743      5701      5700   5653    5636
## 733_4_N_N        8426     7434      7281      7345   7059    6601
## 733_4_N_S        7478     6840      6814      6815   6774    6762
## 733_4_R_N        7036     6173      6094      6096   5922    5572
## 733_4_R_S        3665     3274      3260      3255   3246    3246
## 733_8_N_N        7841     6912      6888      6884   6855    6797
## 733_8_N_S        7764     7038      7007      7007   6963    6905
## 733_8_R_N        7260     6448      6363      6370   6171    5950
## 733_8_R_S        6018     5235      5159      5149   5034    4932
## 733_9_N_N        1366     1214      1150      1156   1052    1038
## 733_9_N_S        6092     5519      5487      5498   5456    5450
## 733_9_R_N        3648     3270      3225      3234   3100    2971
## 733_9_R_S        4583     4132      4107      4108   4083    4070
## KIT_NEG_CTRL       24       12         9         9      9       9
## KIT_NEG_CTRL2      35       20        16        15     15      15
## WATER_NEG_CTRL     35       22        15        22     15      15
## WATER_NEG_CTRL2    48       18        17        17     17      17
## WaterFG            27       16         8        11      8       8
## WaterFG2           28       16        13        13     13      13

Looks good! We kept the majority of our raw reads, and there is no over-large drop associated with any single step after filtering.

Assign taxonomy

It is common at this point, especially in 16S/18S/ITS amplicon sequencing, to assign taxonomy to the sequence variants. The DADA2 package provides a native implementation of the naive Bayesian classifier method for this purpose. The assignTaxonomy function takes as input a set of sequences to be classified and a training set of reference sequences with known taxonomy, and outputs taxonomic assignments with at least minBoot bootstrap confidence.

The recently developed IdTaxa taxonomic classification method is also available via the DECIPHER Bioconductor package. The paper introducing the IDTAXA algorithm reports classification performance that is better than the long-time standard set by the naive Bayesian classifier. Here we include a code block that allows you to use IdTaxa as a drop-in replacement for assignTaxonomy (and it’s faster as well!). Trained classifiers are available from http://DECIPHER.codes/Downloads.html.

dna <- DNAStringSet(getSequences(seqtab.nochim)) # Create a DNAStringSet from the ASVs
load("./16S/SILVA_SSU_r138_2019.RData")
ids <- IdTaxa(dna, trainingSet, strand="both", processors=NULL, verbose=FALSE) # use all processors
ranks <- c("domain", "phylum", "class", "order", "family", "genus", "species") # ranks of interest
# Convert the output object of class "Taxa" to a matrix analogous to the output from assignTaxonomy
taxid <- t(sapply(ids, function(x) {
        m <- match(ranks, x$rank)
        taxa <- x$taxon[m]
        taxa[startsWith(taxa, "unclassified_")] <- NA
        taxa
}))
colnames(taxid) <- ranks; rownames(taxid) <- getSequences(seqtab.nochim)
colnames(taxid) <- c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species")
taxid[taxid == "Ensifer"] <- "Sinorhizobium"
taxa <- taxid

Handoff to phyloseq

The phyloseq R package is a powerful framework for further analysis of microbiome data.

Import into phyloseq:

ps <- phyloseq(otu_table(seqtab.nochim, taxa_are_rows=FALSE), 
               tax_table(taxa))

It is more convenient to use short names for our ASVs (e.g. ASV21) rather than the full DNA sequence when working with some of the tables and visualizations from phyloseq, but we want to keep the full DNA sequences for other purposes like merging with other datasets or indexing into reference databases like the Earth Microbiome Project. For that reason we’ll store the DNA sequences of our ASVs in the refseq slot of the phyloseq object, and then rename our taxa to a short string. That way, the short new taxa names will appear in tables and plots, and we can still recover the DNA sequences corresponding to each ASV as needed with refseq(ps).

dna <- Biostrings::DNAStringSet(taxa_names(ps))
names(dna) <- taxa_names(ps)
ps <- merge_phyloseq(ps, dna)
taxa_names(ps) <- paste0("ASV", seq(ntaxa(ps)))
write.csv(otu_table(ps), "./16S/asv_table.csv")
writeXStringSet(refseq(ps), "./16S/ASV.fasta")
ps
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 248 taxa and 54 samples ]
## tax_table()   Taxonomy Table:    [ 248 taxa by 7 taxonomic ranks ]
## refseq()      DNAStringSet:      [ 248 reference sequences ]

Modify dataframe with sample categories

Using the sample_data function, I can add a dataframe (here, a csv I uploaded named ‘metadata’) of non-numerical variables to categorize my data even further. To do this, I need to make sure that the rownames are the exact same as the rownames in the otu_table for the phyloseq (ps) object I am adding it to.

Upload and create modified dataframe with categories:

seqtab.sample_data <- read.csv("./16S_metadata.csv")
seqtab.sample_data2 <- seqtab.sample_data[,-1]
rownames(seqtab.sample_data2) <- seqtab.sample_data[,1]

Now add it to my phyloseq object:

sample_data(ps) <- seqtab.sample_data2

Remove controls from dataset:

ps <- prune_samples(sample_names(ps) != "KIT_NEG_CTRL", ps)
ps <- prune_samples(sample_names(ps) != "KIT_NEG_CTRL2", ps)
ps <- prune_samples(sample_names(ps) != "WATER_NEG_CTRL", ps)
ps <- prune_samples(sample_names(ps) != "WATER_NEG_CTRL2", ps)
ps <- prune_samples(sample_names(ps) != "WaterFG", ps)
ps <- prune_samples(sample_names(ps) != "WaterFG2", ps)

Additionally, there was an error made in labeling two samples. Sample ID “733_8_N_N” in this dataset actually contains data for 733_8_R_S, and sample ID “733-733_8_R_S-R-S” contains data for 733_8_N_N-8-N-N. I.e. the two samples were switched when dispensing DNA into wells before submission. So I wil lwhich the contents of these rows in the sample_data for the ps object:

# Run only ONCE!
row1_index <- "733_8_N_N"
row2_index <- "733_8_R_S"
row1_data <- sample_data(ps)[row1_index, ]
row2_data <- sample_data(ps)[row2_index, ]
sample_data(ps)[row1_index, ] <- row2_data
sample_data(ps)[row2_index, ] <- row1_data
sample_data(ps)[row1_index, ]
##             Inoculum Tissue            Surface                    Group
## 733_8_N_N 733B + 141   Root Surface-sterilized Root, surface-sterilized
sample_data(ps)[row2_index, ]
##             Inoculum Tissue                Surface
## 733_8_R_S 733B + 141 Nodule Not surface-sterilized
##                                    Group
## 733_8_R_S Nodule, not surface-sterilized

Remove mitochondria and chloroplast families as well as other misc outliers

ps.organelle.rm <- ps
ps.organelle.rm <- subset_taxa(ps.organelle.rm, Order!="Rickettsiales")
ps.organelle.rm <- subset_taxa(ps.organelle.rm, Order!="Chloroplast")
ps.organelle.rm <- subset_taxa(ps.organelle.rm, is.na(Domain) == FALSE)

Differential abundance with DESeq2

Gather abundance data

ps.organelle.rm.newnames <- ps.organelle.rm
taxa_names(ps.organelle.rm.newnames) <- paste0(taxa_names(ps.organelle.rm.newnames), " (", data.frame(tax_table(ps.organelle.rm.newnames))$Family, "; ", data.frame(tax_table(ps.organelle.rm.newnames))$Genus, ")")
counts <- otu_table(ps.organelle.rm.newnames)
metadata <- sample_data(ps.organelle.rm.newnames)
metadata <- data.frame(metadata)
metadata <- subset(metadata, select = -c(Group))
metadata$Tissue <- factor(metadata$Tissue, levels = c("Root", "Nodule"))
metadata$Surface <- factor(metadata$Surface, levels = c("Not surface-sterilized", "Surface-sterilized"))
metadata$Inoculum <- factor(metadata$Inoculum, levels = c("141", "717A + 141", "733B + 141"))
sampleorder <- row.names(metadata[order(metadata$Inoculum, metadata$Tissue, metadata$Surface), ]) # Set sample order
metadata <- metadata[sampleorder, ] # Reorder metadata table by sample order
counts <- counts[sampleorder ,] # Reorder matrix by sample order
counts <- t(counts) # Transpose matrix to flip rows and columns
counts <- data.frame(counts) # Convert from OTU table object to dataframe
colnames(counts) <-  sub("X", "", colnames(counts)) # Remove "X" from beginning of column names that appeared for some reason
head(counts)
##                                                141_1_R_N 141_2_R_N 141_5_R_N
## ASV1 (Rhizobiaceae; Sinorhizobium)                     0       427        70
## ASV4 (Comamonadaceae; NA)                              0         0         0
## ASV5 (Pseudomonadaceae; Pseudomonas)                   0         0         0
## ASV6 (Rhizobiaceae; NA)                              258        49       291
## ASV7 (Comamonadaceae; Candidatus Symbiobacter)         0         0         0
## ASV8 (Methylophilaceae; Methylotenera)                42       114        57
##                                                141_7_R_N 141_1_R_S 141_2_R_S
## ASV1 (Rhizobiaceae; Sinorhizobium)                    15         0        14
## ASV4 (Comamonadaceae; NA)                              0         0         0
## ASV5 (Pseudomonadaceae; Pseudomonas)                   0         0         0
## ASV6 (Rhizobiaceae; NA)                              285         0         0
## ASV7 (Comamonadaceae; Candidatus Symbiobacter)         0         0         0
## ASV8 (Methylophilaceae; Methylotenera)                91         0         0
##                                                141_5_R_S 141_7_R_S 141_1_N_N
## ASV1 (Rhizobiaceae; Sinorhizobium)                    12         9      3236
## ASV4 (Comamonadaceae; NA)                              0         0         0
## ASV5 (Pseudomonadaceae; Pseudomonas)                   0         0         0
## ASV6 (Rhizobiaceae; NA)                                0         0         0
## ASV7 (Comamonadaceae; Candidatus Symbiobacter)         0         0         0
## ASV8 (Methylophilaceae; Methylotenera)                 0         0         0
##                                                141_2_N_N 141_5_N_N 141_7_N_N
## ASV1 (Rhizobiaceae; Sinorhizobium)                  4928      4169      4446
## ASV4 (Comamonadaceae; NA)                              0         0         0
## ASV5 (Pseudomonadaceae; Pseudomonas)                   0         0         0
## ASV6 (Rhizobiaceae; NA)                                7         0         0
## ASV7 (Comamonadaceae; Candidatus Symbiobacter)         0         0         0
## ASV8 (Methylophilaceae; Methylotenera)                 0         0         0
##                                                141_1_N_S 141_2_N_S 141_5_N_S
## ASV1 (Rhizobiaceae; Sinorhizobium)                  1662      4644      1391
## ASV4 (Comamonadaceae; NA)                              0         0         0
## ASV5 (Pseudomonadaceae; Pseudomonas)                   0         0         0
## ASV6 (Rhizobiaceae; NA)                                0         0         0
## ASV7 (Comamonadaceae; Candidatus Symbiobacter)         0         0         0
## ASV8 (Methylophilaceae; Methylotenera)                 0         0         0
##                                                141_7_N_S 717_10_R_N 717_5_R_N
## ASV1 (Rhizobiaceae; Sinorhizobium)                  1650         84         0
## ASV4 (Comamonadaceae; NA)                              0          0         0
## ASV5 (Pseudomonadaceae; Pseudomonas)                   0          0         0
## ASV6 (Rhizobiaceae; NA)                                0        462        35
## ASV7 (Comamonadaceae; Candidatus Symbiobacter)         0        894        11
## ASV8 (Methylophilaceae; Methylotenera)                 0        155       247
##                                                717_6_R_N 717_9_R_N 717_10_R_S
## ASV1 (Rhizobiaceae; Sinorhizobium)                    11        17       5576
## ASV4 (Comamonadaceae; NA)                              0         0          0
## ASV5 (Pseudomonadaceae; Pseudomonas)                   0         0          0
## ASV6 (Rhizobiaceae; NA)                               86        88          0
## ASV7 (Comamonadaceae; Candidatus Symbiobacter)       242        97          0
## ASV8 (Methylophilaceae; Methylotenera)                78        45          0
##                                                717_5_R_S 717_6_R_S 717_9_R_S
## ASV1 (Rhizobiaceae; Sinorhizobium)                    20        31        20
## ASV4 (Comamonadaceae; NA)                              0         0         0
## ASV5 (Pseudomonadaceae; Pseudomonas)                   0         0         0
## ASV6 (Rhizobiaceae; NA)                                0         0         0
## ASV7 (Comamonadaceae; Candidatus Symbiobacter)         0         0         0
## ASV8 (Methylophilaceae; Methylotenera)                 0         0         0
##                                                717_10_N_N 717_5_N_N 717_6_N_N
## ASV1 (Rhizobiaceae; Sinorhizobium)                   7170      4285      3920
## ASV4 (Comamonadaceae; NA)                               0         0         0
## ASV5 (Pseudomonadaceae; Pseudomonas)                    0         0         0
## ASV6 (Rhizobiaceae; NA)                                31         0         0
## ASV7 (Comamonadaceae; Candidatus Symbiobacter)         17         0         0
## ASV8 (Methylophilaceae; Methylotenera)                  0        38         0
##                                                717_9_N_N 717_10_N_S 717_5_N_S
## ASV1 (Rhizobiaceae; Sinorhizobium)                  2317       5666      4131
## ASV4 (Comamonadaceae; NA)                              0          0         0
## ASV5 (Pseudomonadaceae; Pseudomonas)                   0          0         0
## ASV6 (Rhizobiaceae; NA)                               17          0         0
## ASV7 (Comamonadaceae; Candidatus Symbiobacter)         0          0         0
## ASV8 (Methylophilaceae; Methylotenera)                50          0         0
##                                                717_6_N_S 717_9_N_S 733_10_R_N
## ASV1 (Rhizobiaceae; Sinorhizobium)                  4110      2935          0
## ASV4 (Comamonadaceae; NA)                              0         0         18
## ASV5 (Pseudomonadaceae; Pseudomonas)                   0         0          0
## ASV6 (Rhizobiaceae; NA)                                0         0        157
## ASV7 (Comamonadaceae; Candidatus Symbiobacter)         0         0          0
## ASV8 (Methylophilaceae; Methylotenera)                 0         0         20
##                                                733_4_R_N 733_8_R_N 733_9_R_N
## ASV1 (Rhizobiaceae; Sinorhizobium)                  2311      3225      1809
## ASV4 (Comamonadaceae; NA)                            785       637       251
## ASV5 (Pseudomonadaceae; Pseudomonas)                   0         0         0
## ASV6 (Rhizobiaceae; NA)                                0         0         0
## ASV7 (Comamonadaceae; Candidatus Symbiobacter)         0         0         0
## ASV8 (Methylophilaceae; Methylotenera)                21        10         5
##                                                733_10_R_S 733_4_R_S 733_8_N_N
## ASV1 (Rhizobiaceae; Sinorhizobium)                     15       502        24
## ASV4 (Comamonadaceae; NA)                               0         0         0
## ASV5 (Pseudomonadaceae; Pseudomonas)                    0         0         0
## ASV6 (Rhizobiaceae; NA)                                 0         0         0
## ASV7 (Comamonadaceae; Candidatus Symbiobacter)          0         0         0
## ASV8 (Methylophilaceae; Methylotenera)                 11         0         0
##                                                733_9_R_S 733_10_N_N 733_4_N_N
## ASV1 (Rhizobiaceae; Sinorhizobium)                    68       4174         9
## ASV4 (Comamonadaceae; NA)                              0        431       452
## ASV5 (Pseudomonadaceae; Pseudomonas)                   0        140      2384
## ASV6 (Rhizobiaceae; NA)                                0          0         0
## ASV7 (Comamonadaceae; Candidatus Symbiobacter)         0          0         0
## ASV8 (Methylophilaceae; Methylotenera)                 0         39        12
##                                                733_8_R_S 733_9_N_N 733_10_N_S
## ASV1 (Rhizobiaceae; Sinorhizobium)                    12         0       5276
## ASV4 (Comamonadaceae; NA)                           1251        47          0
## ASV5 (Pseudomonadaceae; Pseudomonas)                  13       139          0
## ASV6 (Rhizobiaceae; NA)                                0        12          0
## ASV7 (Comamonadaceae; Candidatus Symbiobacter)         0         0          0
## ASV8 (Methylophilaceae; Methylotenera)               202        15          0
##                                                733_4_N_S 733_8_N_S 733_9_N_S
## ASV1 (Rhizobiaceae; Sinorhizobium)                  4935      4877      3928
## ASV4 (Comamonadaceae; NA)                              0         0         0
## ASV5 (Pseudomonadaceae; Pseudomonas)                   0         0         0
## ASV6 (Rhizobiaceae; NA)                                0         0         0
## ASV7 (Comamonadaceae; Candidatus Symbiobacter)         0         0         0
## ASV8 (Methylophilaceae; Methylotenera)                 0         0         0

Count distribution

I’ll plot the distributions of a few samples:

ggplot(counts) +
  geom_histogram(aes(x = `141_1_N_N`), stat = "bin", bins = 20) +
  xlab("ASV abundance") +
  ylab("Number of ASVs") +
  ggtitle("141_1_N_N")

ggplot(counts) +
  geom_histogram(aes(x = `717_5_R_S`), stat = "bin", bins = 20) +
  xlab("ASV abundance") +
  ylab("Number of ASVs") +
  ggtitle("717_5_R_S")

ggplot(counts) +
  geom_histogram(aes(x = `733_9_N_S`), stat = "bin", bins = 20) +
  xlab("ASV abundance") +
  ylab("Number of ASVs") +
  ggtitle("733_9_N_S")

These samples are left-skewed. The tutorial shows a very similar pattern and states, “This plot illustrates some common features of RNA-seq count data: a low number of counts associated with a large proportion of genes, a long right tail due to the lack of any upper limit for expression, large dynamic range.” So I guess this is a good sign that this is what the typical input data for DESeq looks like at this stage.

Creating a DESeq2 object and normalizing counts

To create a DESeq object from the counts, I need to make sure that the order of column names in the counts table is the same as the order of rownames for the metadata.

# reorder metadata's columns based on row order of the counts  
metadata <- metadata[colnames(counts), ]
all(colnames(counts) == rownames(metadata))
## [1] TRUE

Make DESeq object

# Convert 0 values to ones
counts[counts==0] <- 1 # Convert NA's to pseudo-count 1's for rld

dds <- DESeqDataSetFromMatrix(countData=counts,
                              colData=metadata,
                              design= ~Inoculum + Tissue + Surface)
## converting counts to integer mode
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
dds
## class: DESeqDataSet 
## dim: 204 48 
## metadata(1): version
## assays(1): counts
## rownames(204): ASV1 (Rhizobiaceae; Sinorhizobium) ASV4 (Comamonadaceae;
##   NA) ... ASV246 (NA; NA) ASV248 (Comamonadaceae; NA)
## rowData names(0):
## colnames(48): 141_1_R_N 141_2_R_N ... 733_8_N_S 733_9_N_S
## colData names(3): Inoculum Tissue Surface

Once the DESeq() function is used later on, it will use the function estimateSizeFactors() to generate normalized counts, so I do not have to do it manually here.

Quality control

To explore the similarity of my samples, I will perform sample-level QC using Principal Component Analysis (PCA) and hierarchical clustering methods. The sample-level QC allows me to see how well the replicates cluster together, as well as, observe whether the experimental condition represents the major source of variation in the data. Performing sample-level QC can also identify any sample outliers, which may need to be explored further to determine whether they need to be removed prior to DE analysis.

Make a DESeq object just for sample-level QC

dds_QC <- DESeq(dds, test="LRT", reduced=~1)
## estimating size factors
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## fitting model and testing

Remove low counts before QC

dds_filt <- DGEList(counts=counts(dds_QC), group=metadata$Inoculum:metadata$Tissue:metadata$Surface)
keep <- filterByExpr(dds_filt)
table(keep)
## keep
## TRUE 
##  204

None of them need to be removed.

Hypothesis testing

Hypothesis 1

Inoculum does not significantly affect ASV abundances.

Full model: design = ~ Inoculum + Tissue + Surface.

Reduced model: design = ~ Tissue + Surface

Test

dds_lrt_full.vs.noInoc <- DESeq(dds, test="LRT", reduced = ~ Tissue + Surface)
## estimating size factors
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## fitting model and testing

Results

res_LRT_full.vs.noInoc <- results(dds_lrt_full.vs.noInoc)

# Create a tibble for LRT results
res_LRT_full.vs.noInoc_tb <- res_LRT_full.vs.noInoc %>%
  data.frame() %>%
  rownames_to_column(var="ASV") %>% 
  as_tibble()
write.csv(res_LRT_full.vs.noInoc_tb, "./16S/LRT_test_results_Inoc.csv", row.names = F)

# Subset to return ASVs with padj < 0.05
padj.cutoff <- 0.05 # Set alpha to 0.05
sigLRT_ASVs_full.vs.noInoc <- res_LRT_full.vs.noInoc_tb %>%
  filter(padj < padj.cutoff)
insigLRT_ASVs_full.vs.noInoc <- res_LRT_full.vs.noInoc_tb %>%
  filter(padj >= padj.cutoff)

Get the significant ASVs

sigLRT_ASVs_full.vs.noInoc
## # A tibble: 37 × 7
##    ASV                     baseMean log2FoldChange lfcSE  stat   pvalue     padj
##    <chr>                      <dbl>          <dbl> <dbl> <dbl>    <dbl>    <dbl>
##  1 ASV4 (Comamonadaceae; …    85.5           -2.70 0.573 60.8  6.29e-14 4.84e-12
##  2 ASV5 (Pseudomonadaceae…    59.4           -2.35 0.635 28.7  5.91e- 7 9.10e- 6
##  3 ASV7 (Comamonadaceae; …    27.9           -2.38 0.581 30.8  2.02e- 7 5.19e- 6
##  4 ASV9 (Rhizobiaceae; NA)    23.3           -3.29 0.563 18.7  8.90e- 5 5.61e- 4
##  5 ASV14 (Sphingomonadace…    14.3           -3.61 0.517 17.2  1.88e- 4 9.64e- 4
##  6 ASV15 (Azospirillaceae…    12.8           -2.10 0.565 27.4  1.12e- 6 1.23e- 5
##  7 ASV17 (Rhizobiaceae; A…    10.7           -2.45 0.600  9.63 8.10e- 3 2.01e- 2
##  8 ASV18 (Sphingomonadace…    10.4           -1.94 0.581 21.6  2.03e- 5 1.95e- 4
##  9 ASV20 (Sphingomonadace…     9.18          -2.62 0.514 14.8  6.23e- 4 2.67e- 3
## 10 ASV21 (Devosiaceae; De…     9.01          -2.83 0.523 11.0  4.10e- 3 1.22e- 2
## # ℹ 27 more rows
nrow(sigLRT_ASVs_full.vs.noInoc)
## [1] 37

Get number of insignificant ASVs

insigLRT_ASVs_full.vs.noInoc
## # A tibble: 40 × 7
##    ASV                         baseMean log2FoldChange lfcSE   stat pvalue  padj
##    <chr>                          <dbl>          <dbl> <dbl>  <dbl>  <dbl> <dbl>
##  1 ASV1 (Rhizobiaceae; Sinorh…  2154.           -0.359 0.687 3.29    0.193 0.232
##  2 ASV6 (Rhizobiaceae; NA)        39.2          -4.89  0.563 3.47    0.176 0.230
##  3 ASV8 (Methylophilaceae; Me…    27.8          -5.11  0.535 2.86    0.239 0.282
##  4 ASV11 (Azospirillaceae; Az…    21.8          -4.80  0.574 2.37    0.306 0.337
##  5 ASV16 (Caulobacteraceae; P…    11.6          -3.71  0.509 1.44    0.486 0.519
##  6 ASV19 (Sphingomonadaceae; …     9.25         -3.25  0.508 1.51    0.470 0.509
##  7 ASV22 (Rhodocyclaceae; Dec…     8.86         -3.11  0.513 0.0723  0.965 0.965
##  8 ASV25 (Enterobacteriaceae;…     5.84          2.68  0.503 3.71    0.157 0.219
##  9 ASV29 (Rhodobacteraceae; N…     5.34         -2.95  0.511 0.0705  0.965 0.965
## 10 ASV33 (Microscillaceae; Oh…     4.96         -2.22  0.544 4.47    0.107 0.165
## # ℹ 30 more rows
nrow(insigLRT_ASVs_full.vs.noInoc)
## [1] 40

Contrasts: inoculum 717A + 141 vs 141

Get results

res_inoc_717A_vs_141 <- results(dds_lrt_full.vs.noInoc, contrast = c("Inoculum", "717A + 141", "141"), alpha = 0.05) # Baseline is 141 inoculum
head(res_inoc_717A_vs_141  %>% as.data.frame())
##                                                  baseMean log2FoldChange
## ASV1 (Rhizobiaceae; Sinorhizobium)             2153.84652    1.648332850
## ASV4 (Comamonadaceae; NA)                        85.46005   -0.004799983
## ASV5 (Pseudomonadaceae; Pseudomonas)             59.39209   -0.004441811
## ASV6 (Rhizobiaceae; NA)                          39.20911    0.039710119
## ASV7 (Comamonadaceae; Candidatus Symbiobacter)   27.92732    4.195912406
## ASV8 (Methylophilaceae; Methylotenera)           27.80852    0.831357561
##                                                    lfcSE      stat       pvalue
## ASV1 (Rhizobiaceae; Sinorhizobium)             0.8419657  3.294766 1.925532e-01
## ASV4 (Comamonadaceae; NA)                      0.7565939 60.794638 6.289443e-14
## ASV5 (Pseudomonadaceae; Pseudomonas)           0.8232654 28.683774 5.907416e-07
## ASV6 (Rhizobiaceae; NA)                        0.6547935  3.470635 1.763442e-01
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 0.6960142 30.829545 2.020456e-07
## ASV8 (Methylophilaceae; Methylotenera)         0.6416827  2.864414 2.387813e-01
##                                                        padj
## ASV1 (Rhizobiaceae; Sinorhizobium)             2.709734e-01
## ASV4 (Comamonadaceae; NA)                      5.660499e-12
## ASV5 (Pseudomonadaceae; Pseudomonas)           1.063335e-05
## ASV6 (Rhizobiaceae; NA)                        2.689997e-01
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 6.061368e-06
## ASV8 (Methylophilaceae; Methylotenera)         3.203821e-01

Shrunken log2 foldchanges (LFC)

# Apply fold change shrinkage
res_inoc_717A_vs_141 <- lfcShrink(dds_lrt_full.vs.noInoc, coef="Inoculum_717A...141_vs_141")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
res_inoc_717A_vs_141
## log2 fold change (MAP): Inoculum 717A...141 vs 141 
## LRT p-value: '~ Inoculum + Tissue + Surface' vs '~ Tissue + Surface' 
## DataFrame with 204 rows and 5 columns
##                                                 baseMean log2FoldChange
##                                                <numeric>      <numeric>
## ASV1 (Rhizobiaceae; Sinorhizobium)             2153.8465     0.52267506
## ASV4 (Comamonadaceae; NA)                        85.4601    -0.00114549
## ASV5 (Pseudomonadaceae; Pseudomonas)             59.3921    -0.00441186
## ASV6 (Rhizobiaceae; NA)                          39.2091     0.01682279
## ASV7 (Comamonadaceae; Candidatus Symbiobacter)   27.9273     3.81775075
## ...                                                  ...            ...
## ASV243 (NA; NA)                                  1.05099    -0.00352642
## ASV244 (NA; NA)                                  1.05099    -0.00352642
## ASV245 (NA; NA)                                  1.05099    -0.00352642
## ASV246 (NA; NA)                                  1.05099    -0.00352642
## ASV248 (Comamonadaceae; NA)                      1.05099    -0.00352642
##                                                    lfcSE      pvalue
##                                                <numeric>   <numeric>
## ASV1 (Rhizobiaceae; Sinorhizobium)              0.726192 1.92553e-01
## ASV4 (Comamonadaceae; NA)                       0.472213 6.28944e-14
## ASV5 (Pseudomonadaceae; Pseudomonas)            0.489396 5.90742e-07
## ASV6 (Rhizobiaceae; NA)                         0.433545 1.76344e-01
## ASV7 (Comamonadaceae; Candidatus Symbiobacter)  0.925550 2.02046e-07
## ...                                                  ...         ...
## ASV243 (NA; NA)                                 0.390332    0.999962
## ASV244 (NA; NA)                                 0.390332    0.999962
## ASV245 (NA; NA)                                 0.390332    0.999962
## ASV246 (NA; NA)                                 0.390332    0.999962
## ASV248 (Comamonadaceae; NA)                     0.390332    0.999962
##                                                       padj
##                                                  <numeric>
## ASV1 (Rhizobiaceae; Sinorhizobium)             2.31833e-01
## ASV4 (Comamonadaceae; NA)                      4.84287e-12
## ASV5 (Pseudomonadaceae; Pseudomonas)           9.09742e-06
## ASV6 (Rhizobiaceae; NA)                        2.30144e-01
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 5.18584e-06
## ...                                                    ...
## ASV243 (NA; NA)                                         NA
## ASV244 (NA; NA)                                         NA
## ASV245 (NA; NA)                                         NA
## ASV246 (NA; NA)                                         NA
## ASV248 (Comamonadaceae; NA)                             NA

Summarize results

summary(res_inoc_717A_vs_141, alpha = 0.05)
## 
## out of 204 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 27, 13%
## LFC < 0 (down)     : 10, 4.9%
## outliers [1]       : 20, 9.8%
## low counts [2]     : 107, 52%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

Save results

res_inoc_717A_vs_141_tb <- res_inoc_717A_vs_141 %>%
  as.data.frame() %>%
  rownames_to_column(var="ASV") %>% 
  as_tibble()
write.csv(res_inoc_717A_vs_141_tb, "./16S/test_results_Inoc_717A141_vs_141_shrunken_LFC.csv", row.names = F)
head(res_inoc_717A_vs_141_tb)
## # A tibble: 6 × 6
##   ASV                            baseMean log2FoldChange lfcSE   pvalue     padj
##   <chr>                             <dbl>          <dbl> <dbl>    <dbl>    <dbl>
## 1 ASV1 (Rhizobiaceae; Sinorhizo…   2154.         0.523   0.726 1.93e- 1 2.32e- 1
## 2 ASV4 (Comamonadaceae; NA)          85.5       -0.00115 0.472 6.29e-14 4.84e-12
## 3 ASV5 (Pseudomonadaceae; Pseud…     59.4       -0.00441 0.489 5.91e- 7 9.10e- 6
## 4 ASV6 (Rhizobiaceae; NA)            39.2        0.0168  0.434 1.76e- 1 2.30e- 1
## 5 ASV7 (Comamonadaceae; Candida…     27.9        3.82    0.926 2.02e- 7 5.19e- 6
## 6 ASV8 (Methylophilaceae; Methy…     27.8        0.403   0.514 2.39e- 1 2.82e- 1

Extract significant results

sig_inoc_717A_vs_141 <- res_inoc_717A_vs_141_tb %>% filter(padj < 0.05)
write.csv(sig_inoc_717A_vs_141, "./16S/significant_results_Inoc_717A141_vs_141_shrunken_LFC.csv", row.names = F)
sig_inoc_717A_vs_141
## # A tibble: 37 × 6
##    ASV                           baseMean log2FoldChange lfcSE   pvalue     padj
##    <chr>                            <dbl>          <dbl> <dbl>    <dbl>    <dbl>
##  1 ASV4 (Comamonadaceae; NA)        85.5        -0.00115 0.472 6.29e-14 4.84e-12
##  2 ASV5 (Pseudomonadaceae; Pseu…    59.4        -0.00441 0.489 5.91e- 7 9.10e- 6
##  3 ASV7 (Comamonadaceae; Candid…    27.9         3.82    0.926 2.02e- 7 5.19e- 6
##  4 ASV9 (Rhizobiaceae; NA)          23.3         3.06    0.897 8.90e- 5 5.61e- 4
##  5 ASV14 (Sphingomonadaceae; No…    14.3         2.88    0.818 1.88e- 4 9.64e- 4
##  6 ASV15 (Azospirillaceae; Azos…    12.8         3.17    0.839 1.12e- 6 1.23e- 5
##  7 ASV17 (Rhizobiaceae; Allorhi…    10.7         1.89    0.999 8.10e- 3 2.01e- 2
##  8 ASV18 (Sphingomonadaceae; NA)    10.4         2.80    0.878 2.03e- 5 1.95e- 4
##  9 ASV20 (Sphingomonadaceae; No…     9.18       -0.950   0.620 6.23e- 4 2.67e- 3
## 10 ASV21 (Devosiaceae; Devosia)      9.01        2.04    0.815 4.10e- 3 1.22e- 2
## # ℹ 27 more rows
nrow(sig_inoc_717A_vs_141)
## [1] 37

Contrasts: inoculum 733B + 141 vs 141

Of the ASVs whose abundances are significantly affected by inoculum, which ones are differentially abundant in the 733B + 141 inoculua compared to the baseline 141 inoculum?

Get results

res_inoc_733B_vs_141 <- results(dds_lrt_full.vs.noInoc, contrast = c("Inoculum", "733B + 141", "141"), alpha = 0.05) # Baseline is 141 inoculum
head(res_inoc_733B_vs_141  %>% as.data.frame())
##                                                  baseMean log2FoldChange
## ASV1 (Rhizobiaceae; Sinorhizobium)             2153.84652    1.594262191
## ASV4 (Comamonadaceae; NA)                        85.46005    6.548528151
## ASV5 (Pseudomonadaceae; Pseudomonas)             59.39209    4.884055747
## ASV6 (Rhizobiaceae; NA)                          39.20911   -1.266132353
## ASV7 (Comamonadaceae; Candidatus Symbiobacter)   27.92732   -0.003228623
## ASV8 (Methylophilaceae; Methylotenera)           27.80852    1.151624816
##                                                    lfcSE      stat       pvalue
## ASV1 (Rhizobiaceae; Sinorhizobium)             0.8419685  3.294766 1.925532e-01
## ASV4 (Comamonadaceae; NA)                      0.6801262 60.794638 6.289443e-14
## ASV5 (Pseudomonadaceae; Pseudomonas)           0.7632069 28.683774 5.907416e-07
## ASV6 (Rhizobiaceae; NA)                        0.6691364  3.470635 1.763442e-01
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 0.7624188 30.829545 2.020456e-07
## ASV8 (Methylophilaceae; Methylotenera)         0.6374028  2.864414 2.387813e-01
##                                                        padj
## ASV1 (Rhizobiaceae; Sinorhizobium)             2.709734e-01
## ASV4 (Comamonadaceae; NA)                      5.660499e-12
## ASV5 (Pseudomonadaceae; Pseudomonas)           1.063335e-05
## ASV6 (Rhizobiaceae; NA)                        2.689997e-01
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 6.061368e-06
## ASV8 (Methylophilaceae; Methylotenera)         3.203821e-01

Shrunken log2 foldchanges (LFC)

# Apply fold change shrinkage
res_inoc_733B_vs_141 <- lfcShrink(dds_lrt_full.vs.noInoc, coef="Inoculum_733B...141_vs_141")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
res_inoc_733B_vs_141
## log2 fold change (MAP): Inoculum 733B...141 vs 141 
## LRT p-value: '~ Inoculum + Tissue + Surface' vs '~ Tissue + Surface' 
## DataFrame with 204 rows and 5 columns
##                                                 baseMean log2FoldChange
##                                                <numeric>      <numeric>
## ASV1 (Rhizobiaceae; Sinorhizobium)             2153.8465     0.37234868
## ASV4 (Comamonadaceae; NA)                        85.4601     6.32162328
## ASV5 (Pseudomonadaceae; Pseudomonas)             59.3921     4.36706306
## ASV6 (Rhizobiaceae; NA)                          39.2091    -0.43785604
## ASV7 (Comamonadaceae; Candidatus Symbiobacter)   27.9273    -0.00355456
## ...                                                  ...            ...
## ASV243 (NA; NA)                                  1.05099   -0.000570683
## ASV244 (NA; NA)                                  1.05099   -0.000570683
## ASV245 (NA; NA)                                  1.05099   -0.000570683
## ASV246 (NA; NA)                                  1.05099   -0.000570683
## ASV248 (Comamonadaceae; NA)                      1.05099   -0.000570683
##                                                    lfcSE      pvalue
##                                                <numeric>   <numeric>
## ASV1 (Rhizobiaceae; Sinorhizobium)              0.594031 1.92553e-01
## ASV4 (Comamonadaceae; NA)                       0.875980 6.28944e-14
## ASV5 (Pseudomonadaceae; Pseudomonas)            1.149829 5.90742e-07
## ASV6 (Rhizobiaceae; NA)                         0.581588 1.76344e-01
## ASV7 (Comamonadaceae; Candidatus Symbiobacter)  0.405339 2.02046e-07
## ...                                                  ...         ...
## ASV243 (NA; NA)                                 0.347941    0.999962
## ASV244 (NA; NA)                                 0.347941    0.999962
## ASV245 (NA; NA)                                 0.347941    0.999962
## ASV246 (NA; NA)                                 0.347941    0.999962
## ASV248 (Comamonadaceae; NA)                     0.347941    0.999962
##                                                       padj
##                                                  <numeric>
## ASV1 (Rhizobiaceae; Sinorhizobium)             2.31833e-01
## ASV4 (Comamonadaceae; NA)                      4.84287e-12
## ASV5 (Pseudomonadaceae; Pseudomonas)           9.09742e-06
## ASV6 (Rhizobiaceae; NA)                        2.30144e-01
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 5.18584e-06
## ...                                                    ...
## ASV243 (NA; NA)                                         NA
## ASV244 (NA; NA)                                         NA
## ASV245 (NA; NA)                                         NA
## ASV246 (NA; NA)                                         NA
## ASV248 (Comamonadaceae; NA)                             NA

Summarize results

summary(res_inoc_733B_vs_141, alpha = 0.05)
## 
## out of 204 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 12, 5.9%
## LFC < 0 (down)     : 25, 12%
## outliers [1]       : 20, 9.8%
## low counts [2]     : 107, 52%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

Save results

res_inoc_733B_vs_141_tb <- res_inoc_733B_vs_141 %>%
  as.data.frame() %>%
  rownames_to_column(var="ASV") %>% 
  as_tibble()
write.csv(res_inoc_733B_vs_141_tb, "./16S/test_results_Inoc_733B141_vs_141_shrunken_LFC.csv", row.names = F)
head(res_inoc_733B_vs_141_tb)
## # A tibble: 6 × 6
##   ASV                            baseMean log2FoldChange lfcSE   pvalue     padj
##   <chr>                             <dbl>          <dbl> <dbl>    <dbl>    <dbl>
## 1 ASV1 (Rhizobiaceae; Sinorhizo…   2154.         0.372   0.594 1.93e- 1 2.32e- 1
## 2 ASV4 (Comamonadaceae; NA)          85.5        6.32    0.876 6.29e-14 4.84e-12
## 3 ASV5 (Pseudomonadaceae; Pseud…     59.4        4.37    1.15  5.91e- 7 9.10e- 6
## 4 ASV6 (Rhizobiaceae; NA)            39.2       -0.438   0.582 1.76e- 1 2.30e- 1
## 5 ASV7 (Comamonadaceae; Candida…     27.9       -0.00355 0.405 2.02e- 7 5.19e- 6
## 6 ASV8 (Methylophilaceae; Methy…     27.8        0.453   0.560 2.39e- 1 2.82e- 1

Extract significant results

sig_inoc_733B_vs_141 <- res_inoc_733B_vs_141_tb %>% filter(padj < 0.05)
write.csv(sig_inoc_733B_vs_141, "./16S/significant_results_Inoc_733B141_vs_141_shrunken_LFC.csv", row.names = F)
sig_inoc_733B_vs_141
## # A tibble: 37 × 6
##    ASV                           baseMean log2FoldChange lfcSE   pvalue     padj
##    <chr>                            <dbl>          <dbl> <dbl>    <dbl>    <dbl>
##  1 ASV4 (Comamonadaceae; NA)        85.5        6.32     0.876 6.29e-14 4.84e-12
##  2 ASV5 (Pseudomonadaceae; Pseu…    59.4        4.37     1.15  5.91e- 7 9.10e- 6
##  3 ASV7 (Comamonadaceae; Candid…    27.9       -0.00355  0.405 2.02e- 7 5.19e- 6
##  4 ASV9 (Rhizobiaceae; NA)          23.3        0.411    0.592 8.90e- 5 5.61e- 4
##  5 ASV14 (Sphingomonadaceae; No…    14.3        1.30     0.950 1.88e- 4 9.64e- 4
##  6 ASV15 (Azospirillaceae; Azos…    12.8       -0.00101  0.400 1.12e- 6 1.23e- 5
##  7 ASV17 (Rhizobiaceae; Allorhi…    10.7        0.252    0.478 8.10e- 3 2.01e- 2
##  8 ASV18 (Sphingomonadaceae; NA)    10.4       -0.000973 0.402 2.03e- 5 1.95e- 4
##  9 ASV20 (Sphingomonadaceae; No…     9.18      -2.24     0.783 6.23e- 4 2.67e- 3
## 10 ASV21 (Devosiaceae; Devosia)      9.01       0.874    0.872 4.10e- 3 1.22e- 2
## # ℹ 27 more rows
nrow(sig_inoc_733B_vs_141)
## [1] 37

Volcano plot of the ASVs whose abundance are significantly affected by Inoculum

log2 fold changes of 717A + 141 vs 141 contrasts

Obtain a logical vector where TRUE values denote padj values < 0.05 and fold change > 2 in either direction

log2FoldChange >= 1 since we are working with log2 fold changes so this translates to an actual fold change of 2

res_inoc_717A_vs_141_tb <- res_inoc_717A_vs_141_tb %>% mutate(threshold_717A141 = padj < 0.05 & abs(log2FoldChange) >= 1)
Make labels for the ASVs meeting the cutoff
res_inoc_717A_vs_141_tb$label <- res_inoc_717A_vs_141_tb$ASV
res_inoc_717A_vs_141_tb$label <- gsub("[A-Za-z]+; ", "", res_inoc_717A_vs_141_tb$label) # Use regex to remove family names from parentheses so it's shorter
# Function to modify the label based on the ASV column
modify_label <- function(asv_value, label_value) {
  if (grepl("\\(NA\\)", label_value)) {
    asv_text <- gsub(".*?\\((.*?)\\;.*", "\\1", asv_value)
    modified_label <- gsub("NA", asv_text, label_value)
    return(modified_label)
  } else {
    return(label_value)
  }
}
# Apply the function to create a new_label column
res_inoc_717A_vs_141_tb$new_label <- mapply(modify_label, res_inoc_717A_vs_141_tb$ASV, res_inoc_717A_vs_141_tb$label)
## Remove the label if it does not meet the threshold
res_inoc_717A_vs_141_tb <- res_inoc_717A_vs_141_tb %>% mutate(new_label = case_when(threshold_717A141 == TRUE ~ c(new_label, NULL)))
# Make labels that is just the ASV name if it meets the threshold
res_inoc_717A_vs_141_tb$new_label_2 <- gsub(" \\(.*", "", res_inoc_717A_vs_141_tb$ASV)
## Remove the label if it does not meet the threshold
res_inoc_717A_vs_141_tb <- res_inoc_717A_vs_141_tb %>% mutate(new_label_2 = case_when(threshold_717A141 == TRUE ~ c(new_label_2, NULL)))
# Make labels just for the three ASVs of interest
res_inoc_717A_vs_141_tb <- res_inoc_717A_vs_141_tb %>% mutate(new_label_3 = case_when(ASV == "ASV5 (Pseudomonadaceae; Pseudomonas)" ~ c("ASV5", NULL), ASV == "ASV28 (Bacillaceae; Bacillus)" ~ c("ASV28", NULL), ASV == "ASV1 (Rhizobiaceae; Sinorhizobium)" ~ c("ASV1", NULL)))
# Add back label as in new_label_2 if it is an ASV of interest
res_inoc_717A_vs_141_tb$new_label_4 <- res_inoc_717A_vs_141_tb$new_label_2
res_inoc_717A_vs_141_tb <- res_inoc_717A_vs_141_tb %>% mutate(new_label_4 = ifelse(
    ASV == "ASV5 (Pseudomonadaceae; Pseudomonas)", 
    list("ASV5", new_label_2),
    ifelse(
      ASV == "ASV28 (Bacillaceae; Bacillus)", 
      list("ASV28", new_label_2),
      ifelse(
        ASV == "ASV1 (Rhizobiaceae; Sinorhizobium)",
        list("ASV1", new_label_2),
        new_label_2
      )
    )
  ))
Volcano plot
volcano.inoc_717A_vs_141 <- ggplot(res_inoc_717A_vs_141_tb, aes(x = log2FoldChange, y = -log10(padj))) +
                              geom_point(aes(colour = threshold_717A141)) +
                              geom_text_repel(aes(label = new_label_3), max.overlaps = 20, min.segment.length = 0) +
                              xlab("log2 fold change") + 
                              ylab("-log10 adjusted p-value") +
                              theme_linedraw() +
                              ggtitle("Inoculum: 717A + 141 vs 141") +
                              theme(legend.position = "none", plot.title = element_text(size = 12))
volcano.inoc_717A_vs_141
## Warning: Removed 127 rows containing missing values (`geom_point()`).
## Warning: Removed 201 rows containing missing values (`geom_text_repel()`).

log2 fold changes of 733B + 141 vs 141 contrasts

Obtain a logical vector where TRUE values denote padj values < 0.05 and fold change > 2 in either direction
res_inoc_733B_vs_141_tb <- res_inoc_733B_vs_141_tb %>% mutate(threshold_733B141 = padj < 0.05 & abs(log2FoldChange) >= 1)
Make labels for the ASVs meeting the cutoff
res_inoc_733B_vs_141_tb$label <- res_inoc_733B_vs_141_tb$ASV
res_inoc_733B_vs_141_tb$label <- gsub("[A-Za-z]+; ", "", res_inoc_733B_vs_141_tb$label) # Use regex to remove family names from parentheses so it's shorter
# Apply the function to create a new_label column
res_inoc_733B_vs_141_tb$new_label <- mapply(modify_label, res_inoc_733B_vs_141_tb$ASV, res_inoc_733B_vs_141_tb$label)
## Remove the label if it does not meet the threshold
res_inoc_733B_vs_141_tb <- res_inoc_733B_vs_141_tb %>% mutate(new_label = case_when(threshold_733B141 == TRUE ~ c(new_label, NULL)))
# Make labels that is just the ASV name if it meets the threshold
res_inoc_733B_vs_141_tb$new_label_2 <- gsub(" \\(.*", "", res_inoc_733B_vs_141_tb$ASV)
## Remove the label if it does not meet the threshold
res_inoc_733B_vs_141_tb <- res_inoc_733B_vs_141_tb %>% mutate(new_label_2 = case_when(threshold_733B141 == TRUE ~ c(new_label_2, NULL)))
# Make labels just for the three ASVs of interest
res_inoc_733B_vs_141_tb <- res_inoc_733B_vs_141_tb %>% mutate(new_label_3 = case_when(ASV == "ASV5 (Pseudomonadaceae; Pseudomonas)" ~ c("ASV5", NULL), ASV == "ASV28 (Bacillaceae; Bacillus)" ~ c("ASV28", NULL), ASV == "ASV1 (Rhizobiaceae; Sinorhizobium)" ~ c("ASV1", NULL)))
# Add back label as in new_label_2 if it is an ASV of interest
res_inoc_733B_vs_141_tb$new_label_4 <- res_inoc_733B_vs_141_tb$new_label_2
res_inoc_733B_vs_141_tb <- res_inoc_733B_vs_141_tb %>% mutate(new_label_4 = ifelse(
    ASV == "ASV5 (Pseudomonadaceae; Pseudomonas)", 
    list("ASV5", new_label_2),
    ifelse(
      ASV == "ASV28 (Bacillaceae; Bacillus)", 
      list("ASV28", new_label_2),
      ifelse(
        ASV == "ASV1 (Rhizobiaceae; Sinorhizobium)",
        list("ASV1", new_label_2),
        new_label_2
      )
    )
  ))
Volcano plot
volcano.inoc_733B_vs_141 <- ggplot(res_inoc_733B_vs_141_tb, aes(x = log2FoldChange, y = -log10(padj))) +
                              geom_point(aes(colour = threshold_733B141)) +
                              geom_text_repel(aes(label = new_label_3), max.overlaps = 20, min.segment.length = 0) +
                              xlab("log2 fold change") + 
                              ylab("-log10 adjusted p-value") +
                              theme_linedraw() +
                              ggtitle("Inoculum: 733B + 141 vs 141") +
                              theme(legend.position = "none", plot.title = element_text(size = 12))
volcano.inoc_733B_vs_141
## Warning: Removed 127 rows containing missing values (`geom_point()`).
## Warning: Removed 201 rows containing missing values (`geom_text_repel()`).

Hypothesis 2

Tissue does not significantly affect ASV abundances.

Full model: design = ~ Inoculum + Tissue + Surface.

Reduced model: design = ~ Inoculum + Surface

Test

dds_lrt_full.vs.noTissue <- DESeq(dds, test="LRT", reduced = ~ Inoculum + Surface)
## estimating size factors
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## fitting model and testing

Results

res_LRT_full.vs.noTissue <- results(dds_lrt_full.vs.noTissue)

# Create a tibble for LRT results
res_LRT_full.vs.noTissue_tb <- res_LRT_full.vs.noTissue %>%
  data.frame() %>%
  rownames_to_column(var="ASV") %>% 
  as_tibble()
write.csv(res_LRT_full.vs.noTissue_tb, "./16S/LRT_test_results_Tissue.csv", row.names = F)

# Subset to return ASVs with padj < 0.05
padj.cutoff <- 0.05 # Set alpha to 0.05
sigLRT_ASVs_full.vs.noTissue <- res_LRT_full.vs.noTissue_tb %>%
  filter(padj < padj.cutoff)
insigLRT_ASVs_full.vs.noTissue <- res_LRT_full.vs.noTissue_tb %>%
  filter(padj >= padj.cutoff)

Get the significant ASVs

sigLRT_ASVs_full.vs.noTissue
## # A tibble: 39 × 7
##    ASV                       baseMean log2FoldChange lfcSE  stat  pvalue    padj
##    <chr>                        <dbl>          <dbl> <dbl> <dbl>   <dbl>   <dbl>
##  1 ASV1 (Rhizobiaceae; Sino…   2154.          -0.359 0.687 15.1  1.01e-4 1.23e-3
##  2 ASV5 (Pseudomonadaceae; …     59.4         -2.35  0.635 12.2  4.67e-4 2.54e-3
##  3 ASV6 (Rhizobiaceae; NA)       39.2         -4.89  0.563 16.4  5.13e-5 8.32e-4
##  4 ASV7 (Comamonadaceae; Ca…     27.9         -2.38  0.581  7.88 5.01e-3 1.45e-2
##  5 ASV8 (Methylophilaceae; …     27.8         -5.11  0.535  6.21 1.27e-2 2.61e-2
##  6 ASV9 (Rhizobiaceae; NA)       23.3         -3.29  0.563 10.1  1.52e-3 5.79e-3
##  7 ASV11 (Azospirillaceae; …     21.8         -4.80  0.574  5.71 1.69e-2 3.12e-2
##  8 ASV15 (Azospirillaceae; …     12.8         -2.10  0.565  6.20 1.28e-2 2.61e-2
##  9 ASV16 (Caulobacteraceae;…     11.6         -3.71  0.509 12.9  3.31e-4 2.24e-3
## 10 ASV17 (Rhizobiaceae; All…     10.7         -2.45  0.600 11.0  9.15e-4 3.98e-3
## # ℹ 29 more rows
nrow(sigLRT_ASVs_full.vs.noTissue)
## [1] 39

Get number of insignificant ASVs

insigLRT_ASVs_full.vs.noTissue
## # A tibble: 22 × 7
##    ASV                         baseMean log2FoldChange lfcSE  stat pvalue   padj
##    <chr>                          <dbl>          <dbl> <dbl> <dbl>  <dbl>  <dbl>
##  1 ASV4 (Comamonadaceae; NA)      85.5           -2.70 0.573 0.103 0.748  0.748 
##  2 ASV14 (Sphingomonadaceae; …    14.3           -3.61 0.517 2.31  0.128  0.140 
##  3 ASV24 (Comamonadaceae; Pse…     8.08          -2.27 0.479 2.62  0.105  0.118 
##  4 ASV28 (Bacillaceae; Bacill…     6.51          -1.91 0.536 2.98  0.0844 0.105 
##  5 ASV29 (Rhodobacteraceae; N…     5.34          -2.95 0.511 4.41  0.0358 0.0520
##  6 ASV34 (Hyphomonadaceae; SW…     4.84          -2.84 0.505 3.59  0.0581 0.0770
##  7 ASV42 (Comamonadaceae; Pel…     3.04          -1.50 0.508 0.168 0.682  0.694 
##  8 ASV45 (Xanthobacteraceae; …     2.89          -1.54 0.506 2.99  0.0840 0.105 
##  9 ASV48 (Rhizobiaceae; NA)        2.79          -1.44 0.460 3.77  0.0522 0.0708
## 10 ASV54 (Microscillaceae; Oh…     2.22          -1.58 0.472 1.46  0.226  0.238 
## # ℹ 12 more rows
nrow(insigLRT_ASVs_full.vs.noTissue)
## [1] 22

Contrasts: tissue Nodule vs Root

The differences in abundance of 39 ASVs across inoculum treatment are significant. Of those 39, which ones are differentially abundant in the Nodule samples compared to the Root samples?

Get results

res_tissue_Nod_vs_Root <- results(dds_lrt_full.vs.noTissue, contrast = c("Tissue", "Nodule", "Root"), alpha = 0.05) # Baseline is Root
head(res_tissue_Nod_vs_Root  %>% as.data.frame())
##                                                  baseMean log2FoldChange
## ASV1 (Rhizobiaceae; Sinorhizobium)             2153.84652      3.1847272
## ASV4 (Comamonadaceae; NA)                        85.46005      0.2022541
## ASV5 (Pseudomonadaceae; Pseudomonas)             59.39209      2.3732709
## ASV6 (Rhizobiaceae; NA)                          39.20911     -2.7495204
## ASV7 (Comamonadaceae; Candidatus Symbiobacter)   27.92732     -2.1627127
## ASV8 (Methylophilaceae; Methylotenera)           27.80852     -1.4060265
##                                                    lfcSE       stat
## ASV1 (Rhizobiaceae; Sinorhizobium)             0.6874251 15.1264452
## ASV4 (Comamonadaceae; NA)                      0.5559333  0.1028331
## ASV5 (Pseudomonadaceae; Pseudomonas)           0.6349567 12.2443458
## ASV6 (Rhizobiaceae; NA)                        0.5519223 16.4011054
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 0.5790733  7.8763500
## ASV8 (Methylophilaceae; Methylotenera)         0.5194104  6.2098710
##                                                      pvalue         padj
## ASV1 (Rhizobiaceae; Sinorhizobium)             1.005449e-04 0.0011663211
## ASV4 (Comamonadaceae; NA)                      7.484559e-01 0.7484558750
## ASV5 (Pseudomonadaceae; Pseudomonas)           4.666701e-04 0.0024194971
## ASV6 (Rhizobiaceae; NA)                        5.125533e-05 0.0007913209
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 5.008547e-03 0.0138331295
## ASV8 (Methylophilaceae; Methylotenera)         1.270399e-02 0.0247890346

Shrunken log2 foldchanges (LFC)

# Apply fold change shrinkage
res_tissue_Nod_vs_Root <- lfcShrink(dds_lrt_full.vs.noTissue, coef="Tissue_Nodule_vs_Root")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
res_tissue_Nod_vs_Root
## log2 fold change (MAP): Tissue Nodule vs Root 
## LRT p-value: '~ Inoculum + Tissue + Surface' vs '~ Inoculum + Surface' 
## DataFrame with 204 rows and 5 columns
##                                                 baseMean log2FoldChange
##                                                <numeric>      <numeric>
## ASV1 (Rhizobiaceae; Sinorhizobium)             2153.8465      2.7592776
## ASV4 (Comamonadaceae; NA)                        85.4601      0.0883234
## ASV5 (Pseudomonadaceae; Pseudomonas)             59.3921      1.5802816
## ASV6 (Rhizobiaceae; NA)                          39.2091     -2.4192687
## ASV7 (Comamonadaceae; Candidatus Symbiobacter)   27.9273     -1.5373577
## ...                                                  ...            ...
## ASV243 (NA; NA)                                  1.05099     0.00312418
## ASV244 (NA; NA)                                  1.05099     0.00312418
## ASV245 (NA; NA)                                  1.05099     0.00312418
## ASV246 (NA; NA)                                  1.05099     0.00312418
## ASV248 (Comamonadaceae; NA)                      1.05099     0.00312418
##                                                    lfcSE      pvalue
##                                                <numeric>   <numeric>
## ASV1 (Rhizobiaceae; Sinorhizobium)              0.852680 1.00545e-04
## ASV4 (Comamonadaceae; NA)                       0.405891 7.48456e-01
## ASV5 (Pseudomonadaceae; Pseudomonas)            1.063894 4.66670e-04
## ASV6 (Rhizobiaceae; NA)                         0.680140 5.12553e-05
## ASV7 (Comamonadaceae; Candidatus Symbiobacter)  0.832125 5.00855e-03
## ...                                                  ...         ...
## ASV243 (NA; NA)                                 0.342704     0.98894
## ASV244 (NA; NA)                                 0.342704     0.98894
## ASV245 (NA; NA)                                 0.342704     0.98894
## ASV246 (NA; NA)                                 0.342704     0.98894
## ASV248 (Comamonadaceae; NA)                     0.342704     0.98894
##                                                       padj
##                                                  <numeric>
## ASV1 (Rhizobiaceae; Sinorhizobium)             0.001226648
## ASV4 (Comamonadaceae; NA)                      0.748455875
## ASV5 (Pseudomonadaceae; Pseudomonas)           0.002544643
## ASV6 (Rhizobiaceae; NA)                        0.000832251
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 0.014548636
## ...                                                    ...
## ASV243 (NA; NA)                                         NA
## ASV244 (NA; NA)                                         NA
## ASV245 (NA; NA)                                         NA
## ASV246 (NA; NA)                                         NA
## ASV248 (Comamonadaceae; NA)                             NA

Summarize results

summary(res_tissue_Nod_vs_Root, alpha = 0.05)
## 
## out of 204 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 3, 1.5%
## LFC < 0 (down)     : 36, 18%
## outliers [1]       : 20, 9.8%
## low counts [2]     : 123, 60%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

Save results

res_tissue_Nod_vs_Root_tb <- res_tissue_Nod_vs_Root %>%
  as.data.frame() %>%
  rownames_to_column(var="ASV") %>% 
  as_tibble()
write.csv(res_tissue_Nod_vs_Root_tb, "./16S/test_results_Tissue_Nodule_vs_Root_shrunken_LFC.csv", row.names = F)
head(res_tissue_Nod_vs_Root_tb)
## # A tibble: 6 × 6
##   ASV                              baseMean log2FoldChange lfcSE  pvalue    padj
##   <chr>                               <dbl>          <dbl> <dbl>   <dbl>   <dbl>
## 1 ASV1 (Rhizobiaceae; Sinorhizobi…   2154.          2.76   0.853 1.01e-4 1.23e-3
## 2 ASV4 (Comamonadaceae; NA)            85.5         0.0883 0.406 7.48e-1 7.48e-1
## 3 ASV5 (Pseudomonadaceae; Pseudom…     59.4         1.58   1.06  4.67e-4 2.54e-3
## 4 ASV6 (Rhizobiaceae; NA)              39.2        -2.42   0.680 5.13e-5 8.32e-4
## 5 ASV7 (Comamonadaceae; Candidatu…     27.9        -1.54   0.832 5.01e-3 1.45e-2
## 6 ASV8 (Methylophilaceae; Methylo…     27.8        -1.01   0.592 1.27e-2 2.61e-2

Extract significant results

sig_inoc_Nod_vs_Root <- res_tissue_Nod_vs_Root_tb %>% filter(padj < 0.05)
write.csv(sig_inoc_Nod_vs_Root, "./16S/significant_results_Tissue_Nodule_vs_Root_shrunken_LFC.csv", row.names = F)
sig_inoc_Nod_vs_Root
## # A tibble: 39 × 6
##    ASV                             baseMean log2FoldChange lfcSE  pvalue    padj
##    <chr>                              <dbl>          <dbl> <dbl>   <dbl>   <dbl>
##  1 ASV1 (Rhizobiaceae; Sinorhizob…   2154.            2.76 0.853 1.01e-4 1.23e-3
##  2 ASV5 (Pseudomonadaceae; Pseudo…     59.4           1.58 1.06  4.67e-4 2.54e-3
##  3 ASV6 (Rhizobiaceae; NA)             39.2          -2.42 0.680 5.13e-5 8.32e-4
##  4 ASV7 (Comamonadaceae; Candidat…     27.9          -1.54 0.832 5.01e-3 1.45e-2
##  5 ASV8 (Methylophilaceae; Methyl…     27.8          -1.01 0.592 1.27e-2 2.61e-2
##  6 ASV9 (Rhizobiaceae; NA)             23.3          -1.70 0.715 1.52e-3 5.79e-3
##  7 ASV11 (Azospirillaceae; Azospi…     21.8          -1.02 0.675 1.69e-2 3.12e-2
##  8 ASV15 (Azospirillaceae; Azospi…     12.8          -1.15 0.724 1.28e-2 2.61e-2
##  9 ASV16 (Caulobacteraceae; Pheny…     11.6          -1.75 0.603 3.31e-4 2.24e-3
## 10 ASV17 (Rhizobiaceae; Allorhizo…     10.7          -1.97 0.782 9.15e-4 3.98e-3
## # ℹ 29 more rows
nrow(sig_inoc_Nod_vs_Root)
## [1] 39

Volcano plot of the ASVs whose abundance are significantly affected by Tissue

log2 fold changes of Nodule vs Root contrasts

Obtain a logical vector where TRUE values denote padj values < 0.05 and fold change > 2 in either direction
res_tissue_Nod_vs_Root_tb <- res_tissue_Nod_vs_Root_tb %>% mutate(threshold_NodRoot = padj < 0.05 & abs(log2FoldChange) >= 1)
Make labels for the ASVs meeting the cutoff
res_tissue_Nod_vs_Root_tb$label <- res_tissue_Nod_vs_Root_tb$ASV
res_tissue_Nod_vs_Root_tb$label <- gsub("[A-Za-z]+; ", "", res_tissue_Nod_vs_Root_tb$label) # Use regex to remove family names from parentheses so it's shorter
# Apply the function made above to create a new_label column
res_tissue_Nod_vs_Root_tb$new_label <- mapply(modify_label, res_tissue_Nod_vs_Root_tb$ASV, res_tissue_Nod_vs_Root_tb$label)
## Remove the label if it does not meet the threshold
res_tissue_Nod_vs_Root_tb <- res_tissue_Nod_vs_Root_tb %>% mutate(new_label = case_when(threshold_NodRoot == TRUE ~ c(new_label, NULL)))
# Make labels that is just the ASV name if it meets the threshold
res_tissue_Nod_vs_Root_tb$new_label_2 <- gsub(" \\(.*", "", res_tissue_Nod_vs_Root_tb$ASV)
## Remove the label if it does not meet the threshold
res_tissue_Nod_vs_Root_tb <- res_tissue_Nod_vs_Root_tb %>% mutate(new_label_2 = case_when(threshold_NodRoot == TRUE ~ c(new_label_2, NULL)))
# Make labels just for the three ASVs of interest
res_tissue_Nod_vs_Root_tb <- res_tissue_Nod_vs_Root_tb %>% mutate(new_label_3 = case_when(ASV == "ASV5 (Pseudomonadaceae; Pseudomonas)" ~ c("ASV5", NULL), ASV == "ASV28 (Bacillaceae; Bacillus)" ~ c("ASV28", NULL), ASV == "ASV1 (Rhizobiaceae; Sinorhizobium)" ~ c("ASV1", NULL)))
# Add back label as in new_label_2 if it is an ASV of interest
res_tissue_Nod_vs_Root_tb$new_label_4 <- res_tissue_Nod_vs_Root_tb$new_label_2
res_tissue_Nod_vs_Root_tb <- res_tissue_Nod_vs_Root_tb %>% mutate(new_label_4 = ifelse(
    ASV == "ASV5 (Pseudomonadaceae; Pseudomonas)", 
    list("ASV5", new_label_2),
    ifelse(
      ASV == "ASV28 (Bacillaceae; Bacillus)", 
      list("ASV28", new_label_2),
      ifelse(
        ASV == "ASV1 (Rhizobiaceae; Sinorhizobium)",
        list("ASV1", new_label_2),
        new_label_2
      )
    )
  ))
Volcano plot
volcano.inoc_Nod_vs_Root <- ggplot(res_tissue_Nod_vs_Root_tb, aes(x = log2FoldChange, y = -log10(padj))) +
                              geom_point(aes(colour = threshold_NodRoot)) +
                              geom_text_repel(aes(label = new_label_3), max.overlaps = 20, min.segment.length = 0) +
                              xlab("log2 fold change") + 
                              ylab("-log10 adjusted p-value") +
                              theme_linedraw() +
                              ggtitle("Nodule vs Root") +
                              theme(legend.position = "none", plot.title = element_text(size = 12))
volcano.inoc_Nod_vs_Root
## Warning: Removed 143 rows containing missing values (`geom_point()`).
## Warning: Removed 201 rows containing missing values (`geom_text_repel()`).

Hypothesis 3

Surface sterilization does not significantly affect ASV abundances.

Full model: design = ~ Inoculum + Tissue + Surface.

Reduced model: design = ~ Inoculum + Tissue

Test

dds_lrt_full.vs.noSurface <- DESeq(dds, test="LRT", reduced = ~ Inoculum + Tissue)
## estimating size factors
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## fitting model and testing

Results

res_LRT_full.vs.noSurface <- results(dds_lrt_full.vs.noSurface)

# Create a tibble for LRT results
res_LRT_full.vs.noSurface_tb <- res_LRT_full.vs.noSurface %>%
  data.frame() %>%
  rownames_to_column(var="ASV") %>% 
  as_tibble()
write.csv(res_LRT_full.vs.noSurface_tb, "./16S/LRT_test_results_Surface.csv", row.names = F)

# Subset to return ASVs with padj < 0.05
padj.cutoff <- 0.05 # Set alpha to 0.05
sigLRT_ASVs_full.vs.noSurface <- res_LRT_full.vs.noSurface_tb %>%
  filter(padj < padj.cutoff)
insigLRT_ASVs_full.vs.noSurface <- res_LRT_full.vs.noSurface_tb %>%
  filter(padj >= padj.cutoff)

Get the significant ASVs

sigLRT_ASVs_full.vs.noSurface
## # A tibble: 47 × 7
##    ASV                     baseMean log2FoldChange lfcSE  stat   pvalue     padj
##    <chr>                      <dbl>          <dbl> <dbl> <dbl>    <dbl>    <dbl>
##  1 ASV4 (Comamonadaceae; …     85.5          -2.70 0.573 11.0  9.16e- 4 3.78e- 3
##  2 ASV5 (Pseudomonadaceae…     59.4          -2.35 0.635 12.1  5.02e- 4 2.49e- 3
##  3 ASV6 (Rhizobiaceae; NA)     39.2          -4.89 0.563 50.9  9.58e-13 3.35e-11
##  4 ASV7 (Comamonadaceae; …     27.9          -2.38 0.581  8.86 2.92e- 3 1.02e- 2
##  5 ASV8 (Methylophilaceae…     27.8          -5.11 0.535 74.9  4.90e-18 5.15e-16
##  6 ASV9 (Rhizobiaceae; NA)     23.3          -3.29 0.563 22.5  2.13e- 6 1.83e- 5
##  7 ASV11 (Azospirillaceae…     21.8          -4.80 0.574 54.2  1.80e-13 9.46e-12
##  8 ASV14 (Sphingomonadace…     14.3          -3.61 0.517 35.6  2.38e- 9 5.00e- 8
##  9 ASV15 (Azospirillaceae…     12.8          -2.10 0.565  8.49 3.58e- 3 1.17e- 2
## 10 ASV16 (Caulobacteracea…     11.6          -3.71 0.509 44.4  2.67e-11 7.01e-10
## # ℹ 37 more rows
nrow(sigLRT_ASVs_full.vs.noSurface)
## [1] 47

Get number of insignificant ASVs

insigLRT_ASVs_full.vs.noSurface
## # A tibble: 58 × 7
##    ASV                         baseMean log2FoldChange lfcSE  stat pvalue   padj
##    <chr>                          <dbl>          <dbl> <dbl> <dbl>  <dbl>  <dbl>
##  1 ASV1 (Rhizobiaceae; Sinorh…  2154.           -0.359 0.687 0.210 0.647  0.647 
##  2 ASV61 (Caulobacteraceae; C…     2.10         -1.08  0.470 4.86  0.0275 0.0603
##  3 ASV64 (Rhizobiales Incerta…     1.95         -1.02  0.460 4.65  0.0310 0.0652
##  4 ASV65 (Xanthomonadaceae; P…     1.95         -1.03  0.456 4.81  0.0282 0.0605
##  5 ASV69 (Flavobacteriaceae; …     1.84         -0.968 0.455 4.36  0.0368 0.0757
##  6 ASV71 (Rhizobiaceae; NA)        1.77         -0.933 0.447 4.25  0.0392 0.0792
##  7 ASV74 (Reyranellaceae; Rey…     1.72         -0.894 0.450 3.87  0.0492 0.0957
##  8 ASV81 (Caulobacteraceae; A…     1.55         -0.566 0.417 1.84  0.175  0.259 
##  9 ASV82 (Xanthobacteraceae; …     1.53         -0.865 0.428 4.16  0.0413 0.0819
## 10 ASV84 (Burkholderiaceae; L…     1.54         -0.753 0.452 2.74  0.0977 0.180 
## # ℹ 48 more rows
nrow(insigLRT_ASVs_full.vs.noSurface)
## [1] 58

Contrasts: tissue Nodule vs Root

The differences in abundance of 47 ASVs across inoculum treatment are significant. Of those 47, which ones are differentially abundant in the Nodule samples compared to the Root samples?

Get results

res_tissue_Sterile_vs_Not <- results(dds_lrt_full.vs.noSurface, contrast = c("Surface", "Surface-sterilized", "Not surface-sterilized"), alpha = 0.05) # Baseline is Not surface-sterilized
head(res_tissue_Sterile_vs_Not  %>% as.data.frame())
##                                                  baseMean log2FoldChange
## ASV1 (Rhizobiaceae; Sinorhizobium)             2153.84652     -0.3593853
## ASV4 (Comamonadaceae; NA)                        85.46005     -2.7042808
## ASV5 (Pseudomonadaceae; Pseudomonas)             59.39209     -2.3521719
## ASV6 (Rhizobiaceae; NA)                          39.20911     -4.8897374
## ASV7 (Comamonadaceae; Candidatus Symbiobacter)   27.92732     -2.3764809
## ASV8 (Methylophilaceae; Methylotenera)           27.80852     -5.1073218
##                                                    lfcSE       stat
## ASV1 (Rhizobiaceae; Sinorhizobium)             0.6874250  0.2097697
## ASV4 (Comamonadaceae; NA)                      0.5728428 10.9906257
## ASV5 (Pseudomonadaceae; Pseudomonas)           0.6349587 12.1099015
## ASV6 (Rhizobiaceae; NA)                        0.5634659 50.9278125
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 0.5810043  8.8564065
## ASV8 (Methylophilaceae; Methylotenera)         0.5345081 74.9194790
##                                                      pvalue         padj
## ASV1 (Rhizobiaceae; Sinorhizobium)             6.469479e-01 6.469479e-01
## ASV4 (Comamonadaceae; NA)                      9.157389e-04 2.337995e-03
## ASV5 (Pseudomonadaceae; Pseudomonas)           5.015479e-04 1.540387e-03
## ASV6 (Rhizobiaceae; NA)                        9.582611e-13 2.076232e-11
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 2.920620e-03 6.328010e-03
## ASV8 (Methylophilaceae; Methylotenera)         4.903086e-18 3.187006e-16

Shrunken log2 foldchanges (LFC)

# Apply fold change shrinkage
res_tissue_Sterile_vs_Not <- lfcShrink(dds_lrt_full.vs.noSurface, coef="Surface_Surface.sterilized_vs_Not.surface.sterilized")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
res_tissue_Sterile_vs_Not
## log2 fold change (MAP): Surface Surface.sterilized vs Not.surface.sterilized 
## LRT p-value: '~ Inoculum + Tissue + Surface' vs '~ Inoculum + Tissue' 
## DataFrame with 204 rows and 5 columns
##                                                 baseMean log2FoldChange
##                                                <numeric>      <numeric>
## ASV1 (Rhizobiaceae; Sinorhizobium)             2153.8465      -0.187296
## ASV4 (Comamonadaceae; NA)                        85.4601      -2.157491
## ASV5 (Pseudomonadaceae; Pseudomonas)             59.3921      -1.731137
## ASV6 (Rhizobiaceae; NA)                          39.2091      -4.697198
## ASV7 (Comamonadaceae; Candidatus Symbiobacter)   27.9273      -1.846352
## ...                                                  ...            ...
## ASV243 (NA; NA)                                  1.05099     0.00933195
## ASV244 (NA; NA)                                  1.05099     0.00933195
## ASV245 (NA; NA)                                  1.05099     0.00933195
## ASV246 (NA; NA)                                  1.05099     0.00933195
## ASV248 (Comamonadaceae; NA)                      1.05099     0.00933195
##                                                    lfcSE      pvalue
##                                                <numeric>   <numeric>
## ASV1 (Rhizobiaceae; Sinorhizobium)              0.576363 6.46948e-01
## ASV4 (Comamonadaceae; NA)                       0.854205 9.15739e-04
## ASV5 (Pseudomonadaceae; Pseudomonas)            0.953002 5.01548e-04
## ASV6 (Rhizobiaceae; NA)                         0.654391 9.58261e-13
## ASV7 (Comamonadaceae; Candidatus Symbiobacter)  0.806849 2.92062e-03
## ...                                                  ...         ...
## ASV243 (NA; NA)                                   0.3866    0.979122
## ASV244 (NA; NA)                                   0.3866    0.979122
## ASV245 (NA; NA)                                   0.3866    0.979122
## ASV246 (NA; NA)                                   0.3866    0.979122
## ASV248 (Comamonadaceae; NA)                       0.3866    0.979122
##                                                       padj
##                                                  <numeric>
## ASV1 (Rhizobiaceae; Sinorhizobium)             6.46948e-01
## ASV4 (Comamonadaceae; NA)                      3.77676e-03
## ASV5 (Pseudomonadaceae; Pseudomonas)           2.48832e-03
## ASV6 (Rhizobiaceae; NA)                        3.35391e-11
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 1.02222e-02
## ...                                                    ...
## ASV243 (NA; NA)                                         NA
## ASV244 (NA; NA)                                         NA
## ASV245 (NA; NA)                                         NA
## ASV246 (NA; NA)                                         NA
## ASV248 (Comamonadaceae; NA)                             NA

Summarize results

summary(res_tissue_Sterile_vs_Not, alpha = 0.05)
## 
## out of 204 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 1, 0.49%
## LFC < 0 (down)     : 46, 23%
## outliers [1]       : 20, 9.8%
## low counts [2]     : 79, 39%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

Save results

res_tissue_Sterile_vs_Not_tb <- res_tissue_Sterile_vs_Not %>%
  as.data.frame() %>%
  rownames_to_column(var="ASV") %>% 
  as_tibble()
write.csv(res_tissue_Sterile_vs_Not_tb, "./16S/test_results_Surface_Sterile_vs_NotSterile_shrunken_LFC.csv", row.names = F)
head(res_tissue_Sterile_vs_Not_tb)
## # A tibble: 6 × 6
##   ASV                            baseMean log2FoldChange lfcSE   pvalue     padj
##   <chr>                             <dbl>          <dbl> <dbl>    <dbl>    <dbl>
## 1 ASV1 (Rhizobiaceae; Sinorhizo…   2154.          -0.187 0.576 6.47e- 1 6.47e- 1
## 2 ASV4 (Comamonadaceae; NA)          85.5         -2.16  0.854 9.16e- 4 3.78e- 3
## 3 ASV5 (Pseudomonadaceae; Pseud…     59.4         -1.73  0.953 5.02e- 4 2.49e- 3
## 4 ASV6 (Rhizobiaceae; NA)            39.2         -4.70  0.654 9.58e-13 3.35e-11
## 5 ASV7 (Comamonadaceae; Candida…     27.9         -1.85  0.807 2.92e- 3 1.02e- 2
## 6 ASV8 (Methylophilaceae; Methy…     27.8         -4.99  0.546 4.90e-18 5.15e-16

Extract significant results

sig_inoc_Sterile_vs_Not <- res_tissue_Sterile_vs_Not_tb %>% filter(padj < 0.05)
write.csv(sig_inoc_Sterile_vs_Not, "./16S/significant_results_Surface_Sterile_vs_NotSterile_shrunken_LFC.csv", row.names = F)
sig_inoc_Sterile_vs_Not
## # A tibble: 47 × 6
##    ASV                           baseMean log2FoldChange lfcSE   pvalue     padj
##    <chr>                            <dbl>          <dbl> <dbl>    <dbl>    <dbl>
##  1 ASV4 (Comamonadaceae; NA)         85.5          -2.16 0.854 9.16e- 4 3.78e- 3
##  2 ASV5 (Pseudomonadaceae; Pseu…     59.4          -1.73 0.953 5.02e- 4 2.49e- 3
##  3 ASV6 (Rhizobiaceae; NA)           39.2          -4.70 0.654 9.58e-13 3.35e-11
##  4 ASV7 (Comamonadaceae; Candid…     27.9          -1.85 0.807 2.92e- 3 1.02e- 2
##  5 ASV8 (Methylophilaceae; Meth…     27.8          -4.99 0.546 4.90e-18 5.15e-16
##  6 ASV9 (Rhizobiaceae; NA)           23.3          -3.00 0.688 2.13e- 6 1.83e- 5
##  7 ASV11 (Azospirillaceae; Azos…     21.8          -4.64 0.614 1.80e-13 9.46e-12
##  8 ASV14 (Sphingomonadaceae; No…     14.3          -3.41 0.586 2.38e- 9 5.00e- 8
##  9 ASV15 (Azospirillaceae; Azos…     12.8          -1.66 0.725 3.58e- 3 1.17e- 2
## 10 ASV16 (Caulobacteraceae; Phe…     11.6          -3.55 0.547 2.67e-11 7.01e-10
## # ℹ 37 more rows
nrow(sig_inoc_Sterile_vs_Not)
## [1] 47

Volcano plot of the ASVs whose abundance are significantly affected by Surface

log2 fold changes of Surface-sterile. vs Not surface-sterile. contrasts

Obtain a logical vector where TRUE values denote padj values < 0.05 and fold change > 2 in either direction
res_tissue_Sterile_vs_Not_tb <- res_tissue_Sterile_vs_Not_tb %>% mutate(threshold_Sterile = padj < 0.05 & abs(log2FoldChange) >= 1)
Make labels for the ASVs meeting the cutoff
res_tissue_Sterile_vs_Not_tb$label <- res_tissue_Sterile_vs_Not_tb$ASV
res_tissue_Sterile_vs_Not_tb$label <- gsub("[A-Za-z]+; ", "", res_tissue_Sterile_vs_Not_tb$label) # Use regex to remove family names from parentheses so it's shorter
# Apply the function made above to create a new_label column
res_tissue_Sterile_vs_Not_tb$new_label <- mapply(modify_label, res_tissue_Sterile_vs_Not_tb$ASV, res_tissue_Sterile_vs_Not_tb$label)
## Remove the label if it does not meet the threshold
res_tissue_Sterile_vs_Not_tb <- res_tissue_Sterile_vs_Not_tb %>% mutate(new_label = case_when(threshold_Sterile == TRUE ~ c(new_label, NULL)))
# Make labels that is just the ASV name if it meets the threshold
res_tissue_Sterile_vs_Not_tb$new_label_2 <- gsub(" \\(.*", "", res_tissue_Sterile_vs_Not_tb$ASV)
## Remove the label if it does not meet the threshold
res_tissue_Sterile_vs_Not_tb <- res_tissue_Sterile_vs_Not_tb %>% mutate(new_label_2 = case_when(threshold_Sterile == TRUE ~ c(new_label_2, NULL)))
# Make labels just for the three ASVs of interest
res_tissue_Sterile_vs_Not_tb <- res_tissue_Sterile_vs_Not_tb %>% mutate(new_label_3 = case_when(ASV == "ASV5 (Pseudomonadaceae; Pseudomonas)" ~ c("ASV5", NULL), ASV == "ASV28 (Bacillaceae; Bacillus)" ~ c("ASV28", NULL), ASV == "ASV1 (Rhizobiaceae; Sinorhizobium)" ~ c("ASV1", NULL)))
# Add back label as in new_label_2 if it is an ASV of interest
res_tissue_Sterile_vs_Not_tb$new_label_4 <- res_tissue_Sterile_vs_Not_tb$new_label_2
res_tissue_Sterile_vs_Not_tb <- res_tissue_Sterile_vs_Not_tb %>% mutate(new_label_4 = ifelse(
    ASV == "ASV5 (Pseudomonadaceae; Pseudomonas)", 
    list("ASV5", new_label_2),
    ifelse(
      ASV == "ASV28 (Bacillaceae; Bacillus)", 
      list("ASV28", new_label_2),
      ifelse(
        ASV == "ASV1 (Rhizobiaceae; Sinorhizobium)",
        list("ASV1", new_label_2),
        new_label_2
      )
    )
  ))
Volcano plot
volcano.inoc_Sterile_vs_Not <- ggplot(res_tissue_Sterile_vs_Not_tb, aes(x = log2FoldChange, y = -log10(padj))) +
                              geom_point(aes(colour = threshold_Sterile)) +
                              geom_text_repel(aes(label = new_label_3), max.overlaps = 20, min.segment.length = 0) +
                              xlab("log2 fold change") + 
                              ylab("-log10 adjusted p-value") +
                              theme_linedraw() +
                              ggtitle("Surface-sterilized vs not surface-sterilized") +
                              theme(legend.position = "none", plot.title = element_text(size = 12))
volcano.inoc_Sterile_vs_Not
## Warning: Removed 99 rows containing missing values (`geom_point()`).
## Warning: Removed 201 rows containing missing values (`geom_text_repel()`).

What about the interaction between Inoculum, Tissue, and Surface?

I can also model and test the abundances of ASVs as influenced by the interaction of Tissue and Surface. This would be useful because I could contrast groups of ASVs in the same Tissue treatment but different Surface treatments.

Add an interactions

dds_interact <- dds
dds_interact$TissueSurface <- factor(paste0(dds_interact$Tissue, dds_interact$Surface))
design(dds_interact) <- ~ TissueSurface
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
dds_interact <- DESeq(dds_interact)
## estimating size factors
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## fitting model and testing
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## -- replacing outliers and refitting for 34 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]

Hypothesis 4

The interactions of Tissue and Surface does not significantly affect ASV abundances.

Full model: design = ~TissueSurface.

Reduced model: design = ~ 1 (the intercept)

Test

dds_lrt_Interact.vs.Intercept <- DESeq(dds_interact, test="LRT", reduced = ~ 1) # reduced model is just the intercept
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## fitting model and testing
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## -- replacing outliers and refitting for 34 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]

Results

res_LRT_Interact.vs.Intercept <- results(dds_lrt_Interact.vs.Intercept)

# Create a tibble for LRT results
res_LRT_Interact.vs.Intercept_tb <- res_LRT_Interact.vs.Intercept %>%
  data.frame() %>%
  rownames_to_column(var="ASV") %>% 
  as_tibble()
write.csv(res_LRT_Interact.vs.Intercept_tb, "./16S/LRT_test_results_Interact_vs_Intercept.csv", row.names = F)

# Subset to return ASVs with padj < 0.05
padj.cutoff <- 0.05 # Set alpha to 0.05
sigLRT_ASVs_Interact.vs.Intercept <- res_LRT_Interact.vs.Intercept_tb %>%
  filter(padj < padj.cutoff)
insigLRT_ASVs_Interact.vs.Intercept <- res_LRT_Interact.vs.Intercept_tb %>%
  filter(padj >= padj.cutoff)

Get the significant ASVs

sigLRT_ASVs_Interact.vs.Intercept
## # A tibble: 60 × 7
##    ASV                     baseMean log2FoldChange lfcSE  stat   pvalue     padj
##    <chr>                      <dbl>          <dbl> <dbl> <dbl>    <dbl>    <dbl>
##  1 ASV1 (Rhizobiaceae; Si…  2154.         -2.62    0.987  12.2 6.88e- 3 1.40e- 2
##  2 ASV4 (Comamonadaceae; …    27.1        -6.28    1.03   48.4 1.78e-10 9.21e-10
##  3 ASV5 (Pseudomonadaceae…     7.36       -4.64    0.791  71.4 2.09e-15 1.58e-14
##  4 ASV6 (Rhizobiaceae; NA)    39.2        -2.64    0.795 136.  3.25e-29 3.18e-27
##  5 ASV7 (Comamonadaceae; …     8.89       -1.22    0.881  59.5 7.64e-13 4.41e-12
##  6 ASV8 (Methylophilaceae…    27.8        -4.03    0.737  81.5 1.49e-17 1.62e-16
##  7 ASV9 (Rhizobiaceae; NA)    23.3        -1.66    0.873  97.0 6.85e-21 1.12e-19
##  8 ASV10 (Comamonadaceae;…     3.58        0.00540 0.844  43.1 2.32e- 9 9.88e- 9
##  9 ASV11 (Azospirillaceae…    21.8        -3.75    0.814  83.0 6.89e-18 8.44e-17
## 10 ASV13 (Rhizobiaceae; N…     4.28       -3.57    0.732  41.2 5.99e- 9 2.17e- 8
## # ℹ 50 more rows
nrow(sigLRT_ASVs_Interact.vs.Intercept)
## [1] 60

Get number of insignificant ASVs

insigLRT_ASVs_Interact.vs.Intercept
## # A tibble: 38 × 7
##    ASV                         baseMean log2FoldChange lfcSE  stat pvalue   padj
##    <chr>                          <dbl>          <dbl> <dbl> <dbl>  <dbl>  <dbl>
##  1 ASV18 (Sphingomonadaceae; …     1.57       -0.657   0.654  6.05 0.109  0.151 
##  2 ASV32 (Beijerinckiaceae; B…     1.44        0.00540 0.682  8.27 0.0408 0.0624
##  3 ASV36 (Comamonadaceae; Pse…     1.53       -0.731   0.626  5.11 0.164  0.208 
##  4 ASV40 (Sphingomonadaceae; …     1.42       -1.27    0.578  8.46 0.0373 0.0581
##  5 ASV42 (Comamonadaceae; Pel…     1.31       -0.994   0.582  4.94 0.176  0.218 
##  6 ASV45 (Xanthobacteraceae; …     1.61       -0.579   0.668  7.41 0.0600 0.0891
##  7 ASV61 (Caulobacteraceae; C…     1.29        0.00540 0.647  4.20 0.241  0.281 
##  8 ASV94 (Xanthobacteraceae; …     1.46        0.00540 0.686  8.81 0.0320 0.0505
##  9 ASV95 (Beijerinckiaceae; B…     1.46        0.00540 0.687  8.83 0.0316 0.0505
## 10 ASV97 (Stappiaceae; NA)         1.40        0.00540 0.655  7.65 0.0538 0.0811
## # ℹ 28 more rows
nrow(insigLRT_ASVs_Interact.vs.Intercept)
## [1] 38

Contrasts: surface-sterilized nodule samples vs not surface-sterilized nodule samples

Get results

res_interact_Nod_Sterile_vs_Nod_Not <- results(dds_lrt_Interact.vs.Intercept, contrast = c("TissueSurface", "NoduleSurface sterilized", "NoduleNot surface-sterilized"), alpha = 0.05) # Baseline is not surface-sterilized Nodule samples
head(res_interact_Nod_Sterile_vs_Nod_Not  %>% as.data.frame())
##                                                   baseMean log2FoldChange
## ASV1 (Rhizobiaceae; Sinorhizobium)             2153.846522      0.2294456
## ASV4 (Comamonadaceae; NA)                        27.144536     -6.2809012
## ASV5 (Pseudomonadaceae; Pseudomonas)              7.363803     -4.6396968
## ASV6 (Rhizobiaceae; NA)                          39.209107     -2.6365163
## ASV7 (Comamonadaceae; Candidatus Symbiobacter)    8.892439     -1.2156154
## ASV8 (Methylophilaceae; Methylotenera)           27.808524     -4.9053165
##                                                    lfcSE      stat       pvalue
## ASV1 (Rhizobiaceae; Sinorhizobium)             0.9868250  12.15318 6.876427e-03
## ASV4 (Comamonadaceae; NA)                      1.0320566  48.36033 1.784799e-10
## ASV5 (Pseudomonadaceae; Pseudomonas)           0.7909397  71.44332 2.094871e-15
## ASV6 (Rhizobiaceae; NA)                        0.7949539 135.66803 3.246443e-29
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 0.8805514  59.46638 7.642626e-13
## ASV8 (Methylophilaceae; Methylotenera)         0.7885698  81.46230 1.490531e-17
##                                                        padj
## ASV1 (Rhizobiaceae; Sinorhizobium)             1.160397e-02
## ASV4 (Comamonadaceae; NA)                      7.608882e-10
## ASV5 (Pseudomonadaceae; Pseudomonas)           1.305266e-14
## ASV6 (Rhizobiaceae; NA)                        2.629619e-27
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 3.641487e-12
## ASV8 (Methylophilaceae; Methylotenera)         1.341478e-16

Shrunken log2 foldchanges (LFC)

# Apply fold change shrinkage
res_interact_Nod_Sterile_vs_Nod_Not <- lfcShrink(dds_lrt_Interact.vs.Intercept, coef="TissueSurface_NoduleSurface.sterilized_vs_NoduleNot.surface.sterilized")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
res_interact_Nod_Sterile_vs_Nod_Not
## log2 fold change (MAP): TissueSurface NoduleSurface.sterilized vs NoduleNot.surface.sterilized 
## LRT p-value: '~ TissueSurface' vs '~ 1' 
## DataFrame with 204 rows and 5 columns
##                                                  baseMean log2FoldChange
##                                                 <numeric>      <numeric>
## ASV1 (Rhizobiaceae; Sinorhizobium)             2153.84652      0.0118120
## ASV4 (Comamonadaceae; NA)                        27.14454     -7.1927280
## ASV5 (Pseudomonadaceae; Pseudomonas)              7.36380     -7.6231874
## ASV6 (Rhizobiaceae; NA)                          39.20911     -2.0349494
## ASV7 (Comamonadaceae; Candidatus Symbiobacter)    8.89244     -0.0758511
## ...                                                   ...            ...
## ASV243 (NA; NA)                                   1.05099   -4.36247e-05
## ASV244 (NA; NA)                                   1.05099   -4.36247e-05
## ASV245 (NA; NA)                                   1.05099   -4.36247e-05
## ASV246 (NA; NA)                                   1.05099   -4.36247e-05
## ASV248 (Comamonadaceae; NA)                       1.05099   -4.36247e-05
##                                                    lfcSE      pvalue
##                                                <numeric>   <numeric>
## ASV1 (Rhizobiaceae; Sinorhizobium)              0.215657 6.87643e-03
## ASV4 (Comamonadaceae; NA)                       1.051241 1.78480e-10
## ASV5 (Pseudomonadaceae; Pseudomonas)            0.791297 2.09487e-15
## ASV6 (Rhizobiaceae; NA)                         0.930714 3.24644e-29
## ASV7 (Comamonadaceae; Candidatus Symbiobacter)  0.232491 7.64263e-13
## ...                                                  ...         ...
## ASV243 (NA; NA)                                 0.207625    0.999991
## ASV244 (NA; NA)                                 0.207625    0.999991
## ASV245 (NA; NA)                                 0.207625    0.999991
## ASV246 (NA; NA)                                 0.207625    0.999991
## ASV248 (Comamonadaceae; NA)                     0.207625    0.999991
##                                                       padj
##                                                  <numeric>
## ASV1 (Rhizobiaceae; Sinorhizobium)             1.40394e-02
## ASV4 (Comamonadaceae; NA)                      9.20581e-10
## ASV5 (Pseudomonadaceae; Pseudomonas)           1.57921e-14
## ASV6 (Rhizobiaceae; NA)                        3.18151e-27
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 4.40575e-12
## ...                                                    ...
## ASV243 (NA; NA)                                         NA
## ASV244 (NA; NA)                                         NA
## ASV245 (NA; NA)                                         NA
## ASV246 (NA; NA)                                         NA
## ASV248 (Comamonadaceae; NA)                             NA

Summarize results

summary(res_interact_Nod_Sterile_vs_Nod_Not, alpha = 0.05)
## 
## out of 204 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 31, 15%
## LFC < 0 (down)     : 29, 14%
## outliers [1]       : 0, 0%
## low counts [2]     : 106, 52%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

Save results

res_interact_Nod_Sterile_vs_Nod_Not_tb <- res_interact_Nod_Sterile_vs_Nod_Not %>%
  as.data.frame() %>%
  rownames_to_column(var="ASV") %>% 
  as_tibble()
write.csv(res_interact_Nod_Sterile_vs_Nod_Not_tb, "./16S/test_results_Nod_Sterile_vs_Nod_NotSterile_shrunken_LFC.csv", row.names = F)
head(res_interact_Nod_Sterile_vs_Nod_Not_tb)
## # A tibble: 6 × 6
##   ASV                            baseMean log2FoldChange lfcSE   pvalue     padj
##   <chr>                             <dbl>          <dbl> <dbl>    <dbl>    <dbl>
## 1 ASV1 (Rhizobiaceae; Sinorhizo…  2154.           0.0118 0.216 6.88e- 3 1.40e- 2
## 2 ASV4 (Comamonadaceae; NA)         27.1         -7.19   1.05  1.78e-10 9.21e-10
## 3 ASV5 (Pseudomonadaceae; Pseud…     7.36        -7.62   0.791 2.09e-15 1.58e-14
## 4 ASV6 (Rhizobiaceae; NA)           39.2         -2.03   0.931 3.25e-29 3.18e-27
## 5 ASV7 (Comamonadaceae; Candida…     8.89        -0.0759 0.232 7.64e-13 4.41e-12
## 6 ASV8 (Methylophilaceae; Methy…    27.8         -4.64   0.807 1.49e-17 1.62e-16

Extract significant results

sig_interact_Nod_Sterile_vs_Nod_Not <- res_interact_Nod_Sterile_vs_Nod_Not_tb %>% filter(padj < 0.05)
write.csv(sig_interact_Nod_Sterile_vs_Nod_Not, "./16S/significant_results_sig_interact_Nod_Sterile_vs_Nod_Not_shrunken_LFC.csv", row.names = F)
sig_interact_Nod_Sterile_vs_Nod_Not
## # A tibble: 60 × 6
##    ASV                           baseMean log2FoldChange lfcSE   pvalue     padj
##    <chr>                            <dbl>          <dbl> <dbl>    <dbl>    <dbl>
##  1 ASV1 (Rhizobiaceae; Sinorhiz…  2154.        0.0118    0.216 6.88e- 3 1.40e- 2
##  2 ASV4 (Comamonadaceae; NA)        27.1      -7.19      1.05  1.78e-10 9.21e-10
##  3 ASV5 (Pseudomonadaceae; Pseu…     7.36     -7.62      0.791 2.09e-15 1.58e-14
##  4 ASV6 (Rhizobiaceae; NA)          39.2      -2.03      0.931 3.25e-29 3.18e-27
##  5 ASV7 (Comamonadaceae; Candid…     8.89     -0.0759    0.232 7.64e-13 4.41e-12
##  6 ASV8 (Methylophilaceae; Meth…    27.8      -4.64      0.807 1.49e-17 1.62e-16
##  7 ASV9 (Rhizobiaceae; NA)          23.3      -0.109     0.254 6.85e-21 1.12e-19
##  8 ASV10 (Comamonadaceae; Azohy…     3.58      0.0000515 0.213 2.32e- 9 9.88e- 9
##  9 ASV11 (Azospirillaceae; Azos…    21.8      -3.35      0.860 6.89e-18 8.44e-17
## 10 ASV13 (Rhizobiaceae; NA)          4.28     -3.24      0.763 5.99e- 9 2.17e- 8
## # ℹ 50 more rows
nrow(sig_interact_Nod_Sterile_vs_Nod_Not)
## [1] 60

Volcano plot

log2 fold changes of Nodule, Surface-sterile. vs Nodule, Not surface-sterile. contrasts

Obtain a logical vector where TRUE values denote padj values < 0.05 and fold change > 2 in either direction
res_interact_Nod_Sterile_vs_Nod_Not_tb <- res_interact_Nod_Sterile_vs_Nod_Not_tb %>% mutate(threshold_TissueSurface = padj < 0.05 & abs(log2FoldChange) >= 1)
Make labels for the ASVs meeting the cutoff
res_interact_Nod_Sterile_vs_Nod_Not_tb$label <- res_interact_Nod_Sterile_vs_Nod_Not_tb$ASV
res_interact_Nod_Sterile_vs_Nod_Not_tb$label <- gsub("[A-Za-z]+; ", "", res_interact_Nod_Sterile_vs_Nod_Not_tb$label) # Use regex to remove family names from parentheses so it's shorter
# Apply the function made above to create a new_label column
res_interact_Nod_Sterile_vs_Nod_Not_tb$new_label <- mapply(modify_label, res_interact_Nod_Sterile_vs_Nod_Not_tb$ASV, res_interact_Nod_Sterile_vs_Nod_Not_tb$label)
## Remove the label if it does not meet the threshold
res_interact_Nod_Sterile_vs_Nod_Not_tb <- res_interact_Nod_Sterile_vs_Nod_Not_tb %>% mutate(new_label = case_when(threshold_TissueSurface == TRUE ~ c(new_label, NULL)))
# Make labels that is just the ASV name if it meets the threshold
res_interact_Nod_Sterile_vs_Nod_Not_tb$new_label_2 <- gsub(" \\(.*", "", res_interact_Nod_Sterile_vs_Nod_Not_tb$ASV)
## Remove the label if it does not meet the threshold
res_interact_Nod_Sterile_vs_Nod_Not_tb <- res_interact_Nod_Sterile_vs_Nod_Not_tb %>% mutate(new_label_2 = case_when(threshold_TissueSurface == TRUE ~ c(new_label_2, NULL)))
# Make labels just for the three ASVs of interest
res_interact_Nod_Sterile_vs_Nod_Not_tb <- res_interact_Nod_Sterile_vs_Nod_Not_tb %>% mutate(new_label_3 = case_when(ASV == "ASV5 (Pseudomonadaceae; Pseudomonas)" ~ c("ASV5", NULL), ASV == "ASV28 (Bacillaceae; Bacillus)" ~ c("ASV28", NULL), ASV == "ASV1 (Rhizobiaceae; Sinorhizobium)" ~ c("ASV1", NULL)))
# Add back label as in new_label_2 if it is an ASV of interest
res_interact_Nod_Sterile_vs_Nod_Not_tb$new_label_4 <- res_interact_Nod_Sterile_vs_Nod_Not_tb$new_label_2
res_interact_Nod_Sterile_vs_Nod_Not_tb <- res_interact_Nod_Sterile_vs_Nod_Not_tb %>% mutate(new_label_4 = ifelse(
    ASV == "ASV5 (Pseudomonadaceae; Pseudomonas)", 
    list("ASV5", new_label_2),
    ifelse(
      ASV == "ASV28 (Bacillaceae; Bacillus)", 
      list("ASV28", new_label_2),
      ifelse(
        ASV == "ASV1 (Rhizobiaceae; Sinorhizobium)",
        list("ASV1", new_label_2),
        new_label_2
      )
    )
  ))
Volcano plot
volcano.interact_NodSterile_vs_NodNot <- ggplot(res_interact_Nod_Sterile_vs_Nod_Not_tb, aes(x = log2FoldChange, y = -log10(padj))) +
                                          geom_point(aes(colour = threshold_TissueSurface)) +
                                          geom_text_repel(aes(label = new_label_3), max.overlaps = 20, min.segment.length = 0) +
                                          xlab("log2 fold change") + 
                                          ylab("-log10 adjusted p-value") +
                                          theme_linedraw() +
                                          ggtitle("Nodule, surface-sterilized vs nodule, not surface-sterilized") +
                                          theme(legend.position = "none", plot.title = element_text(size = 12))
volcano.interact_NodSterile_vs_NodNot
## Warning: Removed 106 rows containing missing values (`geom_point()`).
## Warning: Removed 201 rows containing missing values (`geom_text_repel()`).

Contrasts: surface-sterilized root samples vs not surface-sterilized root samples

This requires me to re-factor the “Group” variable so I can change the base for the comparisons:

dds_interact$TissueSurface <- relevel(dds_interact$TissueSurface, ref = "RootNot surface-sterilized")
dds_lrt_Interact.vs.Intercept <- DESeq(dds_interact, test="LRT", reduced = ~ 1) # reduced model is just the intercept
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## fitting model and testing
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## -- replacing outliers and refitting for 34 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]

Get results

res_interact_Root_Sterile_vs_Root_Not <- results(dds_lrt_Interact.vs.Intercept, contrast = c("TissueSurface", "RootSurface sterilized", "RootNot surface-sterilized"), alpha = 0.05) # Baseline is not surface-sterilized Root samples
head(res_interact_Root_Sterile_vs_Root_Not  %>% as.data.frame())
##                                                   baseMean log2FoldChange
## ASV1 (Rhizobiaceae; Sinorhizobium)             2153.846522    -0.33507335
## ASV4 (Comamonadaceae; NA)                        27.144536    -4.53231774
## ASV5 (Pseudomonadaceae; Pseudomonas)              7.363803     0.01739461
## ASV6 (Rhizobiaceae; NA)                          39.209107    -7.13371069
## ASV7 (Comamonadaceae; Candidatus Symbiobacter)    8.892439    -4.87741815
## ASV8 (Methylophilaceae; Methylotenera)           27.808524    -5.30882147
##                                                    lfcSE      stat       pvalue
## ASV1 (Rhizobiaceae; Sinorhizobium)             0.9870736  12.15318 6.876427e-03
## ASV4 (Comamonadaceae; NA)                      1.0345649  48.36033 1.784799e-10
## ASV5 (Pseudomonadaceae; Pseudomonas)           0.8900004  71.44332 2.094871e-15
## ASV6 (Rhizobiaceae; NA)                        0.7780653 135.66803 3.246443e-29
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 0.8407156  59.46638 7.642626e-13
## ASV8 (Methylophilaceae; Methylotenera)         0.7346109  81.46230 1.490531e-17
##                                                        padj
## ASV1 (Rhizobiaceae; Sinorhizobium)             1.160397e-02
## ASV4 (Comamonadaceae; NA)                      7.608882e-10
## ASV5 (Pseudomonadaceae; Pseudomonas)           1.305266e-14
## ASV6 (Rhizobiaceae; NA)                        2.629619e-27
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 3.641487e-12
## ASV8 (Methylophilaceae; Methylotenera)         1.341478e-16

Shrunken log2 foldchanges (LFC)

# Save the unshrunken results to compare
res_interact_Root_Sterile_vs_Root_Not_unshrunken <- res_interact_Root_Sterile_vs_Root_Not

# Apply fold change shrinkage
res_interact_Root_Sterile_vs_Root_Not <- lfcShrink(dds_lrt_Interact.vs.Intercept, coef="TissueSurface_RootSurface.sterilized_vs_RootNot.surface.sterilized")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
res_interact_Root_Sterile_vs_Root_Not
## log2 fold change (MAP): TissueSurface RootSurface.sterilized vs RootNot.surface.sterilized 
## LRT p-value: '~ TissueSurface' vs '~ 1' 
## DataFrame with 204 rows and 5 columns
##                                                  baseMean log2FoldChange
##                                                 <numeric>      <numeric>
## ASV1 (Rhizobiaceae; Sinorhizobium)             2153.84652    -0.17252697
## ASV4 (Comamonadaceae; NA)                        27.14454    -6.82583471
## ASV5 (Pseudomonadaceae; Pseudomonas)              7.36380     0.00649048
## ASV6 (Rhizobiaceae; NA)                          39.20911    -6.95908415
## ASV7 (Comamonadaceae; Candidatus Symbiobacter)    8.89244    -6.44955796
## ...                                                   ...            ...
## ASV243 (NA; NA)                                   1.05099      0.0168483
## ASV244 (NA; NA)                                   1.05099      0.0168483
## ASV245 (NA; NA)                                   1.05099      0.0168483
## ASV246 (NA; NA)                                   1.05099      0.0168483
## ASV248 (Comamonadaceae; NA)                       1.05099      0.0168483
##                                                    lfcSE      pvalue
##                                                <numeric>   <numeric>
## ASV1 (Rhizobiaceae; Sinorhizobium)              0.716681 6.87643e-03
## ASV4 (Comamonadaceae; NA)                       1.051229 1.78480e-10
## ASV5 (Pseudomonadaceae; Pseudomonas)            0.670480 2.09487e-15
## ASV6 (Rhizobiaceae; NA)                         0.782577 3.24644e-29
## ASV7 (Comamonadaceae; Candidatus Symbiobacter)  0.846666 7.64263e-13
## ...                                                  ...         ...
## ASV243 (NA; NA)                                 0.527531    0.999991
## ASV244 (NA; NA)                                 0.527531    0.999991
## ASV245 (NA; NA)                                 0.527531    0.999991
## ASV246 (NA; NA)                                 0.527531    0.999991
## ASV248 (Comamonadaceae; NA)                     0.527531    0.999991
##                                                       padj
##                                                  <numeric>
## ASV1 (Rhizobiaceae; Sinorhizobium)             1.40394e-02
## ASV4 (Comamonadaceae; NA)                      9.20581e-10
## ASV5 (Pseudomonadaceae; Pseudomonas)           1.57921e-14
## ASV6 (Rhizobiaceae; NA)                        3.18151e-27
## ASV7 (Comamonadaceae; Candidatus Symbiobacter) 4.40575e-12
## ...                                                    ...
## ASV243 (NA; NA)                                         NA
## ASV244 (NA; NA)                                         NA
## ASV245 (NA; NA)                                         NA
## ASV246 (NA; NA)                                         NA
## ASV248 (Comamonadaceae; NA)                             NA

Summarize results

summary(res_interact_Root_Sterile_vs_Root_Not, alpha = 0.05)
## 
## out of 204 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 7, 3.4%
## LFC < 0 (down)     : 53, 26%
## outliers [1]       : 0, 0%
## low counts [2]     : 106, 52%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

Save results

res_interact_Root_Sterile_vs_Root_Not_tb <- res_interact_Root_Sterile_vs_Root_Not %>%
  as.data.frame() %>%
  rownames_to_column(var="ASV") %>% 
  as_tibble()
write.csv(res_interact_Root_Sterile_vs_Root_Not_tb, "./16S/test_results_Root_Sterile_vs_Root_NotSterile_shrunken_LFC.csv", row.names = F)
head(res_interact_Root_Sterile_vs_Root_Not_tb)
## # A tibble: 6 × 6
##   ASV                            baseMean log2FoldChange lfcSE   pvalue     padj
##   <chr>                             <dbl>          <dbl> <dbl>    <dbl>    <dbl>
## 1 ASV1 (Rhizobiaceae; Sinorhizo…  2154.         -0.173   0.717 6.88e- 3 1.40e- 2
## 2 ASV4 (Comamonadaceae; NA)         27.1        -6.83    1.05  1.78e-10 9.21e-10
## 3 ASV5 (Pseudomonadaceae; Pseud…     7.36        0.00649 0.670 2.09e-15 1.58e-14
## 4 ASV6 (Rhizobiaceae; NA)           39.2        -6.96    0.783 3.25e-29 3.18e-27
## 5 ASV7 (Comamonadaceae; Candida…     8.89       -6.45    0.847 7.64e-13 4.41e-12
## 6 ASV8 (Methylophilaceae; Methy…    27.8        -5.11    0.746 1.49e-17 1.62e-16

Extract significant results

sig_interact_Root_Sterile_vs_Root_Not <- res_interact_Root_Sterile_vs_Root_Not_tb %>% filter(padj < 0.05)
write.csv(sig_interact_Root_Sterile_vs_Root_Not, "./16S/significant_results_interact_Root_Sterile_vs_Root_Not_shrunken_LFC.csv", row.names = F)
sig_interact_Root_Sterile_vs_Root_Not
## # A tibble: 60 × 6
##    ASV                           baseMean log2FoldChange lfcSE   pvalue     padj
##    <chr>                            <dbl>          <dbl> <dbl>    <dbl>    <dbl>
##  1 ASV1 (Rhizobiaceae; Sinorhiz…  2154.         -0.173   0.717 6.88e- 3 1.40e- 2
##  2 ASV4 (Comamonadaceae; NA)        27.1        -6.83    1.05  1.78e-10 9.21e-10
##  3 ASV5 (Pseudomonadaceae; Pseu…     7.36        0.00649 0.670 2.09e-15 1.58e-14
##  4 ASV6 (Rhizobiaceae; NA)          39.2        -6.96    0.783 3.25e-29 3.18e-27
##  5 ASV7 (Comamonadaceae; Candid…     8.89       -6.45    0.847 7.64e-13 4.41e-12
##  6 ASV8 (Methylophilaceae; Meth…    27.8        -5.11    0.746 1.49e-17 1.62e-16
##  7 ASV9 (Rhizobiaceae; NA)          23.3        -6.15    0.852 6.85e-21 1.12e-19
##  8 ASV10 (Comamonadaceae; Azohy…     3.58       -6.19    0.739 2.32e- 9 9.88e- 9
##  9 ASV11 (Azospirillaceae; Azos…    21.8        -5.86    0.817 6.89e-18 8.44e-17
## 10 ASV13 (Rhizobiaceae; NA)          4.28       -5.60    0.729 5.99e- 9 2.17e- 8
## # ℹ 50 more rows
nrow(sig_interact_Root_Sterile_vs_Root_Not)
## [1] 60

Volcano plot

log2 fold changes of Root, Surface-sterile. vs Root, Not surface-sterile. contrasts

Obtain a logical vector where TRUE values denote padj values < 0.05 and fold change > 2 in either direction
res_interact_Root_Sterile_vs_Root_Not_tb <- res_interact_Root_Sterile_vs_Root_Not_tb %>% mutate(threshold_TissueSurface = padj < 0.05 & abs(log2FoldChange) >= 1)
Make labels for the ASVs meeting the cutoff
res_interact_Root_Sterile_vs_Root_Not_tb$label <- res_interact_Root_Sterile_vs_Root_Not_tb$ASV
res_interact_Root_Sterile_vs_Root_Not_tb$label <- gsub("[A-Za-z]+; ", "", res_interact_Root_Sterile_vs_Root_Not_tb$label) # Use regex to remove family names from parentheses so it's shorter
# Apply the function made above to create a new_label column
res_interact_Root_Sterile_vs_Root_Not_tb$new_label <- mapply(modify_label, res_interact_Root_Sterile_vs_Root_Not_tb$ASV, res_interact_Root_Sterile_vs_Root_Not_tb$label)
## Remove the label if it does not meet the threshold
res_interact_Root_Sterile_vs_Root_Not_tb <- res_interact_Root_Sterile_vs_Root_Not_tb %>% mutate(new_label = case_when(threshold_TissueSurface == TRUE ~ c(new_label, NULL)))
# Make labels that is just the ASV name if it meets the threshold
res_interact_Root_Sterile_vs_Root_Not_tb$new_label_2 <- gsub(" \\(.*", "", res_interact_Root_Sterile_vs_Root_Not_tb$ASV)
## Remove the label if it does not meet the threshold
res_interact_Root_Sterile_vs_Root_Not_tb <- res_interact_Root_Sterile_vs_Root_Not_tb %>% mutate(new_label_2 = case_when(threshold_TissueSurface == TRUE ~ c(new_label_2, NULL)))
# Make labels just for the three ASVs of interest
res_interact_Root_Sterile_vs_Root_Not_tb <- res_interact_Root_Sterile_vs_Root_Not_tb %>% mutate(new_label_3 = case_when(ASV == "ASV5 (Pseudomonadaceae; Pseudomonas)" ~ c("ASV5", NULL), ASV == "ASV28 (Bacillaceae; Bacillus)" ~ c("ASV28", NULL), ASV == "ASV1 (Rhizobiaceae; Sinorhizobium)" ~ c("ASV1", NULL)))
# Add back label as in new_label_2 if it is an ASV of interest
res_interact_Root_Sterile_vs_Root_Not_tb$new_label_4 <- res_interact_Root_Sterile_vs_Root_Not_tb$new_label_2
res_interact_Root_Sterile_vs_Root_Not_tb <- res_interact_Root_Sterile_vs_Root_Not_tb %>% mutate(new_label_4 = ifelse(
    ASV == "ASV5 (Pseudomonadaceae; Pseudomonas)", 
    list("ASV5", new_label_2),
    ifelse(
      ASV == "ASV28 (Bacillaceae; Bacillus)", 
      list("ASV28", new_label_2),
      ifelse(
        ASV == "ASV1 (Rhizobiaceae; Sinorhizobium)",
        list("ASV1", new_label_2),
        new_label_2
      )
    )
  ))
Volcano plot
volcano.interact_RootSterile_vs_RootNot <- ggplot(res_interact_Root_Sterile_vs_Root_Not_tb, aes(x = log2FoldChange, y = -log10(padj))) +
                                          geom_point(aes(colour = threshold_TissueSurface)) +
                                          geom_text_repel(aes(label = new_label_3), max.overlaps = 20, min.segment.length = 0) +
                                          xlab("log2 fold change") + 
                                          ylab("-log10 adjusted p-value") +
                                          theme_linedraw() +
                                          ggtitle("Root, surface-sterilized vs root, not surface-sterilized") +
                                          theme(legend.position = "none", plot.title = element_text(size = 12))
volcano.interact_RootSterile_vs_RootNot
## Warning: Removed 106 rows containing missing values (`geom_point()`).
## Warning: Removed 201 rows containing missing values (`geom_text_repel()`).

Patch together plots

FigS1 <- cowplot::plot_grid(volcano.inoc_717A_vs_141, volcano.inoc_733B_vs_141, volcano.inoc_Nod_vs_Root, volcano.inoc_Sterile_vs_Not, volcano.interact_NodSterile_vs_NodNot, volcano.interact_RootSterile_vs_RootNot, ncol = 2, labels = "AUTO", align = "v", axis = "l", label_size = 24, label_fontfamily = "sans")
## Warning: Removed 127 rows containing missing values (`geom_point()`).
## Warning: Removed 201 rows containing missing values (`geom_text_repel()`).
## Warning: Removed 127 rows containing missing values (`geom_point()`).
## Warning: Removed 201 rows containing missing values (`geom_text_repel()`).
## Warning: Removed 143 rows containing missing values (`geom_point()`).
## Warning: Removed 201 rows containing missing values (`geom_text_repel()`).
## Warning: Removed 99 rows containing missing values (`geom_point()`).
## Warning: Removed 201 rows containing missing values (`geom_text_repel()`).
## Warning: Removed 106 rows containing missing values (`geom_point()`).
## Warning: Removed 201 rows containing missing values (`geom_text_repel()`).
## Warning: Removed 106 rows containing missing values (`geom_point()`).
## Warning: Removed 201 rows containing missing values (`geom_text_repel()`).
FigS1

save

ggsave("./16S/FigS1.png", plot=FigS1, device = "png", width = 6.5, height = 6.5, units = "in", dpi = 300, scale = 2)

Plot heatmap using normalized count data

Get normalized counts, top 50 ASVs only

vsd_mat_fullmodel <- assay(vst(dds, blind=TRUE, nsub=sum(rowMeans(counts(dds))>5)))
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
vsd_mat_fullmodel <- vsd_mat_fullmodel[1:50,]

Set annotation column colors

mat_col <- data.frame(metadata[,1:3])
mat_col[colnames(vsd_mat_fullmodel), ] # reorder so it is the same as the vsd matrix
##              Inoculum Tissue                Surface
## 141_1_R_N         141   Root Not surface-sterilized
## 141_2_R_N         141   Root Not surface-sterilized
## 141_5_R_N         141   Root Not surface-sterilized
## 141_7_R_N         141   Root Not surface-sterilized
## 141_1_R_S         141   Root     Surface-sterilized
## 141_2_R_S         141   Root     Surface-sterilized
## 141_5_R_S         141   Root     Surface-sterilized
## 141_7_R_S         141   Root     Surface-sterilized
## 141_1_N_N         141 Nodule Not surface-sterilized
## 141_2_N_N         141 Nodule Not surface-sterilized
## 141_5_N_N         141 Nodule Not surface-sterilized
## 141_7_N_N         141 Nodule Not surface-sterilized
## 141_1_N_S         141 Nodule     Surface-sterilized
## 141_2_N_S         141 Nodule     Surface-sterilized
## 141_5_N_S         141 Nodule     Surface-sterilized
## 141_7_N_S         141 Nodule     Surface-sterilized
## 717_10_R_N 717A + 141   Root Not surface-sterilized
## 717_5_R_N  717A + 141   Root Not surface-sterilized
## 717_6_R_N  717A + 141   Root Not surface-sterilized
## 717_9_R_N  717A + 141   Root Not surface-sterilized
## 717_10_R_S 717A + 141   Root     Surface-sterilized
## 717_5_R_S  717A + 141   Root     Surface-sterilized
## 717_6_R_S  717A + 141   Root     Surface-sterilized
## 717_9_R_S  717A + 141   Root     Surface-sterilized
## 717_10_N_N 717A + 141 Nodule Not surface-sterilized
## 717_5_N_N  717A + 141 Nodule Not surface-sterilized
## 717_6_N_N  717A + 141 Nodule Not surface-sterilized
## 717_9_N_N  717A + 141 Nodule Not surface-sterilized
## 717_10_N_S 717A + 141 Nodule     Surface-sterilized
## 717_5_N_S  717A + 141 Nodule     Surface-sterilized
## 717_6_N_S  717A + 141 Nodule     Surface-sterilized
## 717_9_N_S  717A + 141 Nodule     Surface-sterilized
## 733_10_R_N 733B + 141   Root Not surface-sterilized
## 733_4_R_N  733B + 141   Root Not surface-sterilized
## 733_8_R_N  733B + 141   Root Not surface-sterilized
## 733_9_R_N  733B + 141   Root Not surface-sterilized
## 733_10_R_S 733B + 141   Root     Surface-sterilized
## 733_4_R_S  733B + 141   Root     Surface-sterilized
## 733_8_N_N  733B + 141   Root     Surface-sterilized
## 733_9_R_S  733B + 141   Root     Surface-sterilized
## 733_10_N_N 733B + 141 Nodule Not surface-sterilized
## 733_4_N_N  733B + 141 Nodule Not surface-sterilized
## 733_8_R_S  733B + 141 Nodule Not surface-sterilized
## 733_9_N_N  733B + 141 Nodule Not surface-sterilized
## 733_10_N_S 733B + 141 Nodule     Surface-sterilized
## 733_4_N_S  733B + 141 Nodule     Surface-sterilized
## 733_8_N_S  733B + 141 Nodule     Surface-sterilized
## 733_9_N_S  733B + 141 Nodule     Surface-sterilized
mat_colors <- list(Inoculum = brewer.pal(7, "Dark2")[1:3], Tissue = brewer.pal(7, "Dark2")[4:5], Surface = brewer.pal(7, "Dark2")[6:7])
names(mat_colors$Inoculum) <- unique(mat_col$Inoculum)
names(mat_colors$Tissue) <- unique(mat_col$Tissue)
names(mat_colors$Surface) <- unique(mat_col$Surface)
Fig4 <- ComplexHeatmap::pheatmap(
  mat                  = vsd_mat_fullmodel,
  cluster_cols         = TRUE,
  cluster_rows         = TRUE,
  color                = magma(8),
  border_color         = NA,
  show_colnames        = FALSE,
  show_rownames        = TRUE,
  annotation_col       = mat_col,
  annotation_colors    = mat_colors,
  drop_levels          = TRUE,
  fontsize             = 9,
  heatmap_legend_param = list(title = "Abundance\n(variance-stabilized)"),
)
Fig4

save

# Open a PNG graphics device
png(filename = "./16S/Fig4.png", width = 9, height = 6.5, units = "in", res = 300)
# Draw the heatmap
plot(Fig4)
# Close the device
dev.off()
## quartz_off_screen 
##                 2